Esempi in Python per move_model_to_device, esempi in Python per utils_nlp.common.pytorch_utils.move_model_to_device

Esempio n. 1

0

Mostra file

def test_move_to_device_gpu(model):
    """Check the type returned by move_model_to_device on a CUDA device.

    The call is exercised with the default ``num_gpus``, with exactly one GPU,
    with one more GPU than is visible, and with exactly the visible count.
    """
    cuda = torch.device("cuda")

    # default num_gpus
    moved_default = move_model_to_device(model, cuda)
    gpu_count = torch.cuda.device_count()

    # With more than one visible GPU a DataParallel wrapper is expected,
    # otherwise the plain Sequential model.
    multi_gpu_type = DataParallel if gpu_count > 1 else Sequential
    assert isinstance(moved_default, multi_gpu_type)

    # explicitly a single GPU: never wrapped
    moved_single = move_model_to_device(model, cuda, num_gpus=1)
    assert isinstance(moved_single, Sequential)

    # asking for more GPUs than are visible
    moved_over = move_model_to_device(model, cuda, num_gpus=gpu_count + 1)
    assert isinstance(moved_over, multi_gpu_type)

    # asking for exactly the visible number of GPUs
    moved_exact = move_model_to_device(model, cuda, num_gpus=gpu_count)
    assert isinstance(moved_exact, multi_gpu_type)

Esempio n. 2

0

Mostra file

    def predict(self,
                eval_dataloader,
                get_inputs,
                num_gpus,
                gpu_ids,
                verbose=True):
        """Yield the model's first output for every batch of a data loader.

        This is a generator: the model is only moved and run once iteration
        starts.

        Args:
            eval_dataloader: Iterable of batches to score.
            get_inputs: Callable invoked as
                ``get_inputs(batch, device, self.model_name, train_mode=False)``;
                its result is unpacked as keyword arguments to the model.
            num_gpus: Forwarded to ``get_device``; the value it returns is
                then passed on to ``parallelize_model``.
            gpu_ids: Forwarded to ``get_device`` and ``parallelize_model``.
            verbose (bool): When False the progress bar is disabled.

        Yields:
            numpy.ndarray: ``outputs[0]`` of the model for one batch, detached
            and copied to the CPU.
        """
        # resolve the target device (local_rank is fixed to -1 here)
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=-1)

        # place the model on the device, then parallelize it
        self.model = move_model_to_device(model=self.model, device=device)
        self.model = parallelize_model(model=self.model,
                                       device=device,
                                       num_gpus=num_gpus,
                                       gpu_ids=gpu_ids,
                                       local_rank=-1)

        # score
        self.model.eval()
        progress = tqdm(eval_dataloader, desc="Scoring", disable=not verbose)
        for batch in progress:
            with torch.no_grad():
                model_inputs = get_inputs(batch,
                                          device,
                                          self.model_name,
                                          train_mode=False)
                batch_logits = self.model(**model_inputs)[0]
            # Yield outside the no_grad block so the suspended generator does
            # not hold the context manager open while the caller runs.
            yield batch_logits.detach().cpu().numpy()

Esempio n. 3

0

Mostra file

File: abstractive_summarization_bertsum.py Progetto: pemukl/german-bertabs

 def this_model_move_callback(model, device):
     """Move ``model`` to ``device`` and return its parallelized version.

     ``num_gpus``, ``gpu_ids`` and ``local_rank`` are not parameters; they are
     resolved from the surrounding scope when the callback runs.
     """
     moved = move_model_to_device(model, device)
     parallelized = parallelize_model(moved,
                                      device,
                                      num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)
     return parallelized

Esempio n. 4

0

Mostra file

def test_move_to_device_cpu_parallelized(model):
    """A DataParallel-wrapped model moved to the CPU must be a Sequential."""
    wrapped = nn.DataParallel(model)
    cpu = torch.device("cpu")
    result = move_model_to_device(wrapped, cpu)
    assert isinstance(result, nn.modules.container.Sequential)

Esempio n. 5

0

Mostra file

    def prepare_model_and_optimizer(
        self,
        num_gpus,
        gpu_ids,
        local_rank,
        weight_decay,
        learning_rate,
        adam_epsilon,
        fp16=False,
        fp16_opt_level="O1",
        checkpoint_state_dict=None,
    ):
        """
        Set up ``self.model`` and ``self.optimizer`` ahead of fine-tuning.

        Most child classes can call this before fine_tune. Child classes that
        need a custom optimizer should override it, or reproduce the steps
        below in the same order.

        Order of operations:
            1. Move model to device
            2. Create optimizer
            3. Initialize amp (only when ``fp16`` is set and amp is available)
            4. Restore optimizer / model / amp state from
               ``checkpoint_state_dict``, if one is given
            5. Parallelize model

        Args:
            num_gpus: Forwarded to ``get_device``; the value it returns is
                used for parallelization and returned to the caller.
            gpu_ids: Forwarded to ``get_device`` and ``parallelize_model``.
            local_rank: Forwarded to ``get_device`` and ``parallelize_model``.
            weight_decay: Passed to ``Transformer.get_default_optimizer``.
            learning_rate: Passed to ``Transformer.get_default_optimizer``.
            adam_epsilon: Passed to ``Transformer.get_default_optimizer``.
            fp16: Passed to ``get_amp``; also gates the amp steps.
            fp16_opt_level: ``opt_level`` handed to ``amp.initialize``.
            checkpoint_state_dict: Optional mapping with "optimizer" and
                "model" entries (and "amp" when amp is in use).

        Returns:
            tuple: ``(device, num_gpus, amp)``.
        """

        amp = get_amp(fp16)
        # amp steps only run when fp16 was requested AND get_amp returned
        # something truthy
        mixed_precision = fp16 and amp

        # 1. resolve the device and move the model onto it
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)
        self.model = move_model_to_device(model=self.model, device=device)

        # 2. optimizer is built from the already-moved model
        self.optimizer = Transformer.get_default_optimizer(
            self.model, weight_decay, learning_rate, adam_epsilon)

        # 3. amp wraps both model and optimizer
        if mixed_precision:
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, opt_level=fp16_opt_level)

        # 4. restore saved state before the model is parallelized
        if checkpoint_state_dict:
            self.optimizer.load_state_dict(checkpoint_state_dict["optimizer"])
            self.model.load_state_dict(checkpoint_state_dict["model"])
            if mixed_precision:
                amp.load_state_dict(checkpoint_state_dict["amp"])

        # 5. parallelize last
        self.model = parallelize_model(
            model=self.model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        return device, num_gpus, amp

Esempio n. 6

0

Mostra file

File: sequence_encoding.py Progetto: yhe0802/nlp-recipes

    def get_hidden_states(self, text, batch_size=32):
        """Extract the hidden states from the pretrained model.

        The text is tokenized and preprocessed with ``self.tokenizer``, run
        through ``self.model`` in batches, and the layers listed in
        ``self.layer_index`` are collected token by token. As a side effect
        ``self.embedding_dim`` is set from the last dimension of the first
        returned layer.

        Args:
            text: List of documents to extract features from.
            batch_size: Batch size, defaults to 32.

        Returns:
            pd.DataFrame with columns:
                text_index (int), token (str), layer_index (int), values (list[float]).
        """
        device, _ = get_device(self.num_gpus)
        self.model = move_model_to_device(self.model, device, self.num_gpus)

        self.model.eval()

        tokens = self.tokenizer.tokenize(text)

        tokens, input_ids, input_mask, input_type_ids = self.tokenizer.preprocess_encoder_tokens(
            tokens, max_len=self.max_len
        )

        input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
        input_mask = torch.tensor(input_mask, dtype=torch.long, device=device)
        # Replaced by a running index 0..N-1 so every row of a batch can be
        # mapped back to its position in ``tokens``.
        input_type_ids = torch.arange(input_ids.size(0), dtype=torch.long, device=device)

        eval_data = TensorDataset(input_ids, input_mask, input_type_ids)
        eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size)

        hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []}
        for (input_ids_tensor, input_mask_tensor, example_indices_tensor) in eval_dataloader:
            with torch.no_grad():
                all_encoder_layers, _ = self.model(
                    input_ids_tensor, token_type_ids=None, attention_mask=input_mask_tensor
                )
                self.embedding_dim = all_encoder_layers[0].size()[-1]

            # Copy each requested layer to host memory once per batch rather
            # than once per (example, token, layer) combination.
            layer_outputs = [
                all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                for layer_index in self.layer_index
            ]

            for b, example_index in enumerate(example_indices_tensor):
                text_index = example_index.item()
                for (i, token) in enumerate(tokens[text_index]):
                    for layer_index, layer_output in zip(self.layer_index, layer_outputs):
                        hidden_states["text_index"].append(text_index)
                        hidden_states["token"].append(token)
                        hidden_states["layer_index"].append(layer_index)
                        hidden_states["values"].append(
                            [round(x.item(), 6) for x in layer_output[b][i]]
                        )

            # empty cache
            del [input_ids_tensor, input_mask_tensor, example_indices_tensor]
            torch.cuda.empty_cache()

        # empty cache
        del [input_ids, input_mask, input_type_ids]
        torch.cuda.empty_cache()

        return pd.DataFrame.from_dict(hidden_states)

Esempio n. 7

0

Mostra file

def test_move_to_device_exception_not_torch_device(model):
    """Passing a plain string instead of a torch.device must raise ValueError."""
    not_a_device = "abc"
    with pytest.raises(ValueError):
        move_model_to_device(model, not_a_device)

Esempio n. 8

0

Mostra file

def test_move_to_device_cpu(model):
    """Moving the model to a CPU device must yield a Sequential."""
    cpu = torch.device("cpu")
    moved = move_model_to_device(model, cpu)
    assert isinstance(moved, nn.modules.container.Sequential)

Esempio n. 9

0

Mostra file

File: sequence_classification_distributed.py Progetto: yhe0802/nlp-recipes

    def fit(
        self,
        train_loader,
        epoch,
        bert_optimizer=None,
        num_epochs=1,
        num_gpus=None,
        lr=2e-5,
        warmup_proportion=None,
        fp16_allreduce=False,
        num_train_optimization_steps=10,
    ):
        """
        Method to fine-tune the bert classifier using the given training data.

        Runs a single pass over ``train_loader``; ``epoch`` and ``num_epochs``
        are only used in the progress message.

        Args:
            train_loader(torch.DataLoader): Torch Dataloader created from Torch Dataset.
                Each batch is indexed with the keys "token_ids", "labels",
                "input_mask" and, optionally, "token_type_ids".
            epoch(int): Current epoch number of training.
            bert_optimizer(optimizer): optimizer can be BERTAdam for local and Distributed if Horovod.
                If None, one is built with ``self.create_optimizer``.
            num_epochs(int): the number of epochs to run
            num_gpus(int): the number of gpus. If None is specified, all available GPUs will be used.
            lr (float): learning rate of the adam optimizer. defaults to 2e-5.
            warmup_proportion (float, optional): proportion of training to
                perform linear learning rate warmup for. e.g., 0.1 = 10% of
                training. defaults to none.
            fp16_allreduce(bool): if true, use fp16 compression during allreduce
            num_train_optimization_steps: number of steps the optimizer should take.
        """

        device, num_gpus = get_device(num_gpus)

        self.model = move_model_to_device(self.model, device, num_gpus)

        if bert_optimizer is None:
            bert_optimizer = self.create_optimizer(
                num_train_optimization_steps=num_train_optimization_steps,
                lr=lr,
                warmup_proportion=warmup_proportion,
                fp16_allreduce=fp16_allreduce,
            )

        if self.use_distributed:
            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

        loss_func = nn.CrossEntropyLoss().to(device)

        # train
        self.model.train()  # training mode

        # Pre-bind the batch names so the cleanup after the loop also works
        # when ``train_loader`` yields no batches.
        x_batch = y_batch = mask_batch = token_type_ids_batch = None

        num_print = 1000
        for batch_idx, data in enumerate(train_loader):

            # Move the batch to the same device the model and the loss were
            # placed on, instead of unconditionally calling .cuda().
            x_batch = data["token_ids"].to(device)
            y_batch = data["labels"].to(device)
            mask_batch = data["input_mask"].to(device)

            if "token_type_ids" in data and data["token_type_ids"] is not None:
                token_type_ids_batch = data["token_type_ids"].to(device)

            bert_optimizer.zero_grad()

            y_h = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )

            loss = loss_func(y_h, y_batch).mean()
            loss.backward()

            bert_optimizer.synchronize()
            bert_optimizer.step()

            if batch_idx % num_print == 0:
                print(
                    "Train Epoch: {}/{} ({:.0f}%) \t Batch:{} \tLoss: {:.6f}".
                    format(
                        epoch,
                        num_epochs,
                        100.0 * batch_idx / len(train_loader),
                        batch_idx + 1,
                        loss.item(),
                    ))

        # drop the last batch references before releasing cached GPU memory
        del [x_batch, y_batch, mask_batch, token_type_ids_batch]
        torch.cuda.empty_cache()

Esempio n. 10

0

Mostra file

    def predict(self,
                token_ids,
                input_mask,
                labels=None,
                batch_size=32,
                num_gpus=None,
                probabilities=False):
        """
        Predict token labels on the testing data.

        Args:
            token_ids (list): List of lists. Each sublist contains
                numerical token ids corresponding to the tokens in the input
                text data.
            input_mask (list): List of lists. Each sublist contains
                the attention mask of the input token list, 1 for input
                tokens and 0 for padded tokens, so that padded tokens are
                not attended to.
            labels (list, optional): List of lists. Each sublist contains
                numerical token labels of an input sentence/paragraph.
                If provided, it's used to compute the evaluation loss.
                Default value is None.
            batch_size (int, optional): Testing batch size. Defaults to 32.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. Defaults to None.
            probabilities (bool, optional): If True, the softmax probability
                of each predicted class is returned alongside the labels.
                Defaults to False.

        Returns:
            list or namedtuple(list, ndarray): List of lists of predicted
                token labels or ([token labels], probabilities) if
                probabilities is True. The probabilities output is an n x m
                array, where n is the size of the testing data and m is the
                number of tokens in each input sublist. The probability
                values are the softmax probability of the predicted class.

        Raises:
            ValueError: If the data loader yields no batches.
        """
        test_dataloader = create_data_loader(
            input_ids=token_ids,
            input_mask=input_mask,
            label_ids=labels,
            batch_size=batch_size,
            sample_method="sequential",
        )
        device, num_gpus = get_device(num_gpus)

        self.model = move_model_to_device(self.model, device, num_gpus)

        self.model.eval()

        # Whether labels were supplied does not change between batches, so it
        # is decided once, together with the loss object.
        true_label_available = bool(labels)
        loss_fct = nn.CrossEntropyLoss() if true_label_available else None

        eval_loss = 0
        nb_eval_steps = 0
        # Per-batch logits are collected and concatenated once at the end;
        # appending to a growing array would re-copy it on every batch.
        logits_batches = []
        for batch in tqdm(test_dataloader, desc="Iteration", mininterval=10):
            batch = tuple(t.to(device) for t in batch)
            if true_label_available:
                b_input_ids, b_input_mask, b_labels = batch
            else:
                b_input_ids, b_input_mask = batch

            with torch.no_grad():
                logits = self.model(b_input_ids, attention_mask=b_input_mask)
                if true_label_available:
                    # only positions with mask == 1 contribute to the loss
                    active_loss = b_input_mask.view(-1) == 1
                    active_logits = logits.view(-1,
                                                self.num_labels)[active_loss]
                    active_labels = b_labels.view(-1)[active_loss]
                    tmp_eval_loss = loss_fct(active_logits, active_labels)

                    eval_loss += tmp_eval_loss.mean().item()

            logits_batches.append(logits.detach().cpu().numpy())
            nb_eval_steps += 1

        if not logits_batches:
            raise ValueError("No batches to predict on: the input data is empty.")

        logits_all = np.concatenate(logits_batches, axis=0)
        predictions = [list(p) for p in np.argmax(logits_all, axis=2)]

        if true_label_available:
            validation_loss = eval_loss / nb_eval_steps
            print("Evaluation loss: {}".format(validation_loss))

        if probabilities:
            return namedtuple("Predictions", "classes probabilities")(
                predictions,
                np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2))
        else:
            return predictions

Esempio n. 11

0

Mostra file

def test_move_to_device_exception_cuda_zero_gpus(model):
    """A CUDA device combined with num_gpus=0 must raise ValueError."""
    cuda = torch.device("cuda")
    with pytest.raises(ValueError):
        move_model_to_device(model, cuda, num_gpus=0)

Esempio n. 12

0

Mostra file

File: sequence_classification.py Progetto: zini-julia/nlp-recipes

    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        token_type_ids=None,
        num_gpus=None,
        num_epochs=1,
        batch_size=32,
        lr=2e-5,
        warmup_proportion=None,
        verbose=True,
    ):
        """Fine-tunes the BERT classifier using the given training data.

        Args:
            token_ids (list): List of training token id lists.
            input_mask (list): List of input mask lists.
            labels (list): List of training labels.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 32.
            lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
            warmup_proportion (float, optional): Proportion of training to
                perform linear learning rate warmup for. E.g., 0.1 = 10% of
                training. Defaults to None.
            verbose (bool, optional): If True, shows the training progress and
                loss values. Defaults to True.
        """

        device, num_gpus = get_device(num_gpus)

        self.model = move_model_to_device(self.model, device)
        self.model = parallelize_model(self.model, device, num_gpus=num_gpus)

        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
        labels_tensor = torch.tensor(labels, dtype=torch.long)

        # The dataset has an extra segment-id column only for two-sentence
        # tasks; labels are always the last column.
        if token_type_ids:
            token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
            train_dataset = TensorDataset(
                token_ids_tensor,
                input_mask_tensor,
                token_type_ids_tensor,
                labels_tensor,
            )
        else:
            train_dataset = TensorDataset(
                token_ids_tensor, input_mask_tensor, labels_tensor
            )
        train_sampler = RandomSampler(train_dataset)

        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=batch_size
        )
        # define optimizer and model parameters
        # (parameters whose name matches ``no_decay`` get weight_decay 0.0)
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        num_batches = len(train_dataloader)
        num_train_optimization_steps = num_batches * num_epochs

        if warmup_proportion is None:
            opt = BertAdam(optimizer_grouped_parameters, lr=lr)
        else:
            opt = BertAdam(
                optimizer_grouped_parameters,
                lr=lr,
                t_total=num_train_optimization_steps,
                warmup=warmup_proportion,
            )

        # define loss function
        loss_func = nn.CrossEntropyLoss().to(device)

        # train
        self.model.train()  # training mode

        # Pre-bind the batch names so the cleanup after the loops also works
        # when no batch is processed (e.g. num_epochs=0).
        x_batch = y_batch = mask_batch = token_type_ids_batch = None

        for epoch in range(num_epochs):
            training_loss = 0
            for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if token_type_ids:
                    x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                        t.to(device) for t in batch
                    )
                else:
                    token_type_ids_batch = None
                    x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)

                opt.zero_grad()

                y_h = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
                loss = loss_func(y_h, y_batch).mean()

                training_loss += loss.item()

                loss.backward()
                opt.step()
                # report roughly ten times per epoch
                if verbose and i % ((num_batches // 10) + 1) == 0:
                    print(
                        "epoch:{}/{}; batch:{}->{}/{}; avg loss:{:.6f}".format(
                            epoch + 1,
                            num_epochs,
                            i + 1,
                            min(i + 1 + num_batches // 10, num_batches),
                            num_batches,
                            training_loss / (i + 1),
                        )
                    )
        # empty cache
        del [x_batch, y_batch, mask_batch, token_type_ids_batch]
        torch.cuda.empty_cache()

Esempio n. 13

0

Mostra file

File: sequence_classification.py Progetto: zini-julia/nlp-recipes

    def predict(
        self,
        token_ids,
        input_mask,
        token_type_ids=None,
        num_gpus=None,
        batch_size=32,
        probabilities=False,
    ):
        """Score a dataset and return the predicted class of every example.

        Args:
            token_ids (list): List of token id lists to score.
            input_mask (list): List of input mask lists.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            num_gpus (int, optional): The number of gpus to use.
                If None is specified, all available GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Scoring batch size. Defaults to 32.
            probabilities (bool, optional): If True, the predicted
                probability distribution is also returned.
                Defaults to False.

        Returns:
            1darray, namedtuple(1darray, ndarray): Predicted classes or
                (classes, probabilities) if probabilities is True.
        """
        device, num_gpus = get_device(num_gpus)
        self.model = move_model_to_device(self.model, device)
        self.model = parallelize_model(self.model, device, num_gpus)

        # score
        self.model.eval()

        # Columns of the dataset: ids, mask and, only for two-sentence
        # tasks, the segment ids.
        dataset_tensors = [
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(input_mask, dtype=torch.long),
        ]
        if token_type_ids:
            dataset_tensors.append(
                torch.tensor(token_type_ids, dtype=torch.long)
            )
        test_dataset = TensorDataset(*dataset_tensors)

        test_dataloader = DataLoader(
            test_dataset,
            sampler=SequentialSampler(test_dataset),
            batch_size=batch_size,
        )

        batch_outputs = []
        for batch in tqdm(test_dataloader, desc="Iteration"):
            on_device = tuple(t.to(device) for t in batch)
            if token_type_ids:
                x_batch, mask_batch, token_type_ids_batch = on_device
            else:
                x_batch, mask_batch = on_device
                token_type_ids_batch = None

            with torch.no_grad():
                p_batch = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
            batch_outputs.append(p_batch.cpu())

        preds = np.concatenate(batch_outputs)
        predicted_classes = preds.argmax(axis=1)

        if not probabilities:
            return predicted_classes

        result_type = namedtuple("Predictions", "classes probabilities")
        return result_type(
            predicted_classes,
            nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
        )

Esempio n. 14

0

Mostra file

    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        val_token_ids,
        val_input_mask,
        val_labels,
        token_type_ids=None,
        val_token_type_ids=None,
        verbose=True,
        logging_steps=0,
        save_steps=0,
        val_steps=0,
    ):
        """Fine-tunes the XLNet classifier using the given training data.

        Args:
            token_ids (list): List of training token id lists.
            input_mask (list): List of input mask lists.
            labels (list): List of training labels.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            verbose (bool, optional): If True, shows the training progress and
                loss values. Defaults to True.
        """

        device, num_gpus = get_device(self.num_gpus)
        self.model = move_model_to_device(self.model, device, self.num_gpus)

        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
        labels_tensor = torch.tensor(labels, dtype=torch.long)

        val_token_ids_tensor = torch.tensor(val_token_ids, dtype=torch.long)
        val_input_mask_tensor = torch.tensor(val_input_mask, dtype=torch.long)
        val_labels_tensor = torch.tensor(val_labels, dtype=torch.long)

        if token_type_ids:
            token_type_ids_tensor = torch.tensor(token_type_ids,
                                                 dtype=torch.long)
            val_token_type_ids_tensor = torch.tensor(val_token_type_ids,
                                                     dtype=torch.long)

            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor,
                                          token_type_ids_tensor, labels_tensor)

            val_dataset = TensorDataset(
                val_token_ids_tensor,
                val_input_mask_tensor,
                val_token_type_ids_tensor,
                val_labels_tensor,
            )

        else:

            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor,
                                          labels_tensor)

            val_dataset = TensorDataset(val_token_ids_tensor,
                                        val_input_mask_tensor,
                                        val_labels_tensor)

        # define optimizer and model parameters
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.weight_decay,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]

        val_sampler = RandomSampler(val_dataset)

        val_dataloader = DataLoader(val_dataset,
                                    sampler=val_sampler,
                                    batch_size=self.batch_size)

        num_examples = len(token_ids)
        num_batches = int(np.ceil(num_examples / self.batch_size))
        num_train_optimization_steps = num_batches * self.num_epochs

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.lr,
                          eps=self.adam_eps)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=num_train_optimization_steps)

        global_step = 0
        self.model.train()
        optimizer.zero_grad()
        for epoch in range(self.num_epochs):

            train_sampler = RandomSampler(train_dataset)

            train_dataloader = DataLoader(train_dataset,
                                          sampler=train_sampler,
                                          batch_size=self.batch_size)

            tr_loss = 0.0
            logging_loss = 0.0
            val_loss = 0.0

            for i, batch in enumerate(tqdm(train_dataloader,
                                           desc="Iteration")):
                if token_type_ids:
                    x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                        t.to(device) for t in batch)
                else:
                    token_type_ids_batch = None
                    x_batch, mask_batch, y_batch = tuple(
                        t.to(device) for t in batch)

                outputs = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=y_batch,
                )

                loss = outputs[
                    0]  # model outputs are always tuple in pytorch-transformers

                loss.sum().backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.max_grad_norm)

                tr_loss += loss.sum().item()
                optimizer.step()
                # Update learning rate schedule
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                # logging of learning rate and loss
                if logging_steps > 0 and global_step % logging_steps == 0:
                    mlflow.log_metric("learning rate",
                                      scheduler.get_lr()[0],
                                      step=global_step)
                    mlflow.log_metric(
                        "training loss",
                        (tr_loss - logging_loss) /
                        (logging_steps * self.batch_size),
                        step=global_step,
                    )
                    logging_loss = tr_loss
                # model checkpointing
                if save_steps > 0 and global_step % save_steps == 0:
                    checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
                    if not os.path.isdir(checkpoint_dir):
                        os.makedirs(checkpoint_dir)
                    checkpoint_path = checkpoint_dir + "/" + str(
                        global_step) + ".pth"
                    torch.save(self.model.state_dict(), checkpoint_path)
                    mlflow.log_artifact(checkpoint_path)
                # model validation
                if val_steps > 0 and global_step % val_steps == 0:
                    # run model on validation set
                    self.model.eval()
                    val_loss = 0.0
                    for j, val_batch in enumerate(val_dataloader):
                        if token_type_ids:
                            val_x_batch, val_mask_batch, val_token_type_ids_batch, val_y_batch = tuple(
                                t.to(device) for t in val_batch)
                        else:
                            token_type_ids_batch = None
                            val_x_batch, val_mask_batch, val_y_batch = tuple(
                                t.to(device) for t in val_batch)
                        val_outputs = self.model(
                            input_ids=val_x_batch,
                            token_type_ids=val_token_type_ids_batch,
                            attention_mask=val_mask_batch,
                            labels=val_y_batch,
                        )
                        vloss = val_outputs[0]
                        val_loss += vloss.sum().item()
                    mlflow.log_metric("validation loss",
                                      val_loss / len(val_dataset),
                                      step=global_step)
                    self.model.train()

                if verbose:
                    if i % ((num_batches // 10) + 1) == 0:
                        if val_loss > 0:
                            print(
                                "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f};\
                                 average val loss:{:.6f}".format(
                                    epoch + 1,
                                    self.num_epochs,
                                    i + 1,
                                    min(i + 1 + num_batches // 10,
                                        num_batches),
                                    num_batches,
                                    tr_loss / (i + 1),
                                    val_loss / (j + 1),
                                ))
                        else:
                            print(
                                "epoch:{}/{}; batch:{}->{}/{}; average train loss:{:.6f}"
                                .format(
                                    epoch + 1,
                                    self.num_epochs,
                                    i + 1,
                                    min(i + 1 + num_batches // 10,
                                        num_batches),
                                    num_batches,
                                    tr_loss / (i + 1),
                                ))
        checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_path = checkpoint_dir + "/" + "final" + ".pth"
        torch.save(self.model.state_dict(), checkpoint_path)
        mlflow.log_artifact(checkpoint_path)
        # empty cache
        del [x_batch, y_batch, mask_batch, token_type_ids_batch]
        if val_steps > 0:
            del [
                val_x_batch, val_y_batch, val_mask_batch,
                val_token_type_ids_batch
            ]
        torch.cuda.empty_cache()

Esempio n. 15

0

Mostra file

    def predict(
        self,
        token_ids,
        input_mask,
        token_type_ids=None,
        num_gpus=None,
        batch_size=8,
        probabilities=False,
    ):
        """Scores the given dataset and returns the predicted classes.

        Args:
            token_ids (list): List of training token lists.
            input_mask (list): List of input mask lists.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks. When None, ``token_type_ids=None``
                is passed to the model. Defaults to None.
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            batch_size (int, optional): Scoring batch size. Defaults to 8.
            probabilities (bool, optional):
                If True, the predicted probability distribution
                is also returned. Defaults to False.
        Returns:
            1darray, namedtuple(1darray, ndarray): Predicted classes or
                (classes, probabilities) if probabilities is True.
        """

        device, num_gpus = get_device(num_gpus)
        self.model = move_model_to_device(self.model, device, num_gpus)

        self.model.eval()
        preds = []
        num_examples = len(token_ids)

        with tqdm(total=num_examples) as pbar:
            for start in range(0, num_examples, batch_size):
                # Clip the end so the progress bar advances by the real batch
                # size and finishes exactly at `total` on the last batch.
                end = min(start + batch_size, num_examples)
                x_batch = torch.tensor(token_ids[start:end],
                                       dtype=torch.long,
                                       device=device)
                mask_batch = torch.tensor(input_mask[start:end],
                                          dtype=torch.long,
                                          device=device)

                # token_type_ids is optional (single-sentence tasks); slicing
                # None would raise a TypeError, so only build the tensor when
                # segment ids were actually supplied.
                if token_type_ids is not None:
                    token_type_ids_batch = torch.tensor(
                        token_type_ids[start:end],
                        dtype=torch.long,
                        device=device)
                else:
                    token_type_ids_batch = None

                with torch.no_grad():
                    pred_batch = self.model(
                        input_ids=x_batch,
                        token_type_ids=token_type_ids_batch,
                        attention_mask=mask_batch,
                        labels=None,
                    )
                # The first element of the model output is used as the
                # per-class scores; move it to CPU before accumulating.
                preds.append(pred_batch[0].cpu())
                pbar.update(end - start)

        preds = np.concatenate(preds)

        if probabilities:
            return namedtuple("Predictions", "classes probabilities")(
                preds.argmax(axis=1),
                nn.Softmax(dim=1)(torch.Tensor(preds)).numpy())
        return preds.argmax(axis=1)

Esempio n. 16

0

Mostra file

File: test_common_pytorch_utils.py Progetto: zini-julia/nlp-recipes

def test_move_to_device_cpu(model):
    """Moving a model to the CPU device yields a Sequential whose params are on CPU."""
    cpu_device = torch.device("cpu")
    moved = move_model_to_device(model, cpu_device)
    # The returned object must still be the plain container, not a wrapper.
    assert isinstance(moved, nn.modules.container.Sequential)
    first_param = next(moved.parameters())
    assert first_param.is_cuda is False

Esempio n. 17

0

Mostra file

File: test_common_pytorch_utils.py Progetto: zini-julia/nlp-recipes

def test_parallelize_model(model):
    """Exercise parallelize_model over several num_gpus / gpu_ids combinations.

    Every scenario first moves the ``model`` argument to a device with
    move_model_to_device, then passes the result to parallelize_model and
    asserts on the device of the first parameter (the first scenario also
    asserts on the wrapper type).

    NOTE(review): the same ``model`` object is reused by every scenario, and
    most scenarios target torch.device("cuda"), so this test presumably needs
    at least one visible GPU -- confirm how it is gated/skipped.
    """
    # test when device.type="cuda" and move model to all devices
    model_cuda = move_model_to_device(model, torch.device("cuda"))
    model_cuda = parallelize_model(model_cuda, torch.device("cuda"))
    num_cuda_devices = torch.cuda.device_count()
    # NOTE(review): unlike later scenarios this assertion is not guarded by
    # num_cuda_devices > 1 -- presumably it only holds on multi-GPU machines.
    assert isinstance(model_cuda, DataParallel)

    # test moving model to only one gpu
    model_cuda_1_gpu = move_model_to_device(model, torch.device("cuda"))
    assert next(model_cuda_1_gpu.parameters()).is_cuda is True
    model_cuda_1_gpu = parallelize_model(
        model_cuda_1_gpu, torch.device("cuda"), num_gpus=1
    )
    assert next(model_cuda_1_gpu.parameters()).is_cuda is True

    # test parallelize_model can limit the number of devices
    # (requests one more GPU than torch reports as available)
    model_cuda_1_more_gpu = move_model_to_device(model, torch.device("cuda"))
    model_cuda_1_more_gpu = parallelize_model(
        model_cuda_1_more_gpu, torch.device("cuda"), num_gpus=num_cuda_devices + 1
    )
    # Accessing .module assumes the result is a wrapper exposing `.module`.
    assert next(model_cuda_1_more_gpu.module.parameters()).is_cuda is True

    # test parallelize_model on the same number of devices
    model_cuda_same_gpu = move_model_to_device(model, torch.device("cuda"))
    model_cuda_same_gpu = parallelize_model(
        model_cuda_same_gpu, torch.device("cuda"), num_gpus=num_cuda_devices
    )
    assert next(model_cuda_same_gpu.module.parameters()).is_cuda is True

    # test parallelize_model with gpu id
    model_base = move_model_to_device(model, torch.device("cuda"))
    # when gpu id is [], gpu id [0] is used
    model_cuda_0_gpu = parallelize_model(model_base, torch.device("cuda"), gpu_ids=[])
    # device has priority ??
    # NOTE(review): this assertion inspects model_cuda_1_gpu (from the
    # single-GPU scenario above), not model_cuda_0_gpu -- confirm intended.
    assert next(model_cuda_1_gpu.parameters()).device == torch.device("cuda:0")
    assert next(model_cuda_0_gpu.parameters()).is_cuda is True

    # test parallelize_model with gpu id is [0]
    model_base = move_model_to_device(model, torch.device("cuda"))
    model_cuda_1_gpu = parallelize_model(model_base, torch.device("cuda"), gpu_ids=[0])
    assert next(model_cuda_1_gpu.parameters()).is_cuda is True

    # test parallelize_model with gpu id is [0:num_device]
    model_base = move_model_to_device(model, torch.device("cuda"))
    model_cuda_same_gpu = parallelize_model(
        model_base, torch.device("cuda"), gpu_ids=list(range(num_cuda_devices))
    )
    # With more than one GPU the parameters are reached through `.module`;
    # otherwise they are read from the returned object directly.
    if num_cuda_devices > 1:
        assert next(model_cuda_same_gpu.module.parameters()).is_cuda is True
    else:
        assert next(model_cuda_same_gpu.parameters()).is_cuda is True

    # test parallelize_model with gpu id is [1: num_devices+3]
    # (ids 1 .. num_cuda_devices + 2, i.e. some ids beyond the visible GPUs)
    model_base = move_model_to_device(model, torch.device("cuda"))
    model_cuda_same_gpu = parallelize_model(
        model_base,
        torch.device("cuda"),
        gpu_ids=[x + 1 for x in list(range(num_cuda_devices + 2))],
    )
    if num_cuda_devices > 1:
        assert next(model_cuda_same_gpu.module.parameters()).is_cuda is True
    else:
        assert next(model_cuda_same_gpu.parameters()).is_cuda is True

    # when intersection is only 1
    # (requested ids start at the last valid GPU index, so only that one exists)
    model_base = move_model_to_device(model, torch.device("cuda"))
    gpu_ids = [x + num_cuda_devices - 1 for x in list(range(num_cuda_devices))]
    model_cuda_intersect_1_gpu = parallelize_model(
        model_base, torch.device("cuda"), gpu_ids=gpu_ids
    )
    assert next(model_cuda_intersect_1_gpu.parameters()).device == torch.device(
        "cuda:{}".format(num_cuda_devices - 1)
    )
    assert next(model_cuda_intersect_1_gpu.parameters()).is_cuda is True

    # when there is no intersection, no change to the model
    # (all requested ids are >= num_cuda_devices)
    model_base = move_model_to_device(model, torch.device("cuda"))
    model_cuda_intersect_0_gpu = parallelize_model(
        model_base,
        torch.device("cuda"),
        gpu_ids=[x + num_cuda_devices for x in list(range(num_cuda_devices))],
    )
    assert (
        next(model_cuda_intersect_0_gpu.parameters()).device
        == next(model_base.parameters()).device
    )
    assert next(model_cuda_intersect_0_gpu.parameters()).is_cuda is True
    # test device is cpu original model on gpu
    # (parallelize_model is asked for cpu; the parameters are expected to stay on gpu)
    model_base = move_model_to_device(model, torch.device("cuda"))
    model_cuda_cpu = parallelize_model(
        model_base,
        torch.device("cpu"),
        gpu_ids=[x + num_cuda_devices for x in list(range(num_cuda_devices))],
    )
    assert next(model_cuda_cpu.parameters()).is_cuda is True
    # test device is cpu and original model on cpu
    model_base = move_model_to_device(model, torch.device("cpu"))
    model_cuda_cpu = parallelize_model(
        model_base,
        torch.device("cpu"),
        gpu_ids=[x + num_cuda_devices for x in list(range(num_cuda_devices))],
    )
    assert next(model_cuda_cpu.parameters()).is_cuda is False

Esempio n. 18

0

Mostra file

def test_move_to_device_exception_wrong_type(model):
    """move_model_to_device must raise when device.type is neither "cuda" nor "cpu"."""
    expect_failure = pytest.raises(Exception)
    with expect_failure:
        # Built inside the context so any error while creating the device is
        # also accepted, exactly as when it is constructed inline in the call.
        unsupported_device = torch.device("opengl")
        move_model_to_device(model, unsupported_device)

Esempio n. 19

0

Mostra file

def test_move_to_device_exception_gpu_model_on_cpu_machine(model):
    """Moving a model to a "cuda" device on a CPU-only machine must raise."""
    expect_failure = pytest.raises(Exception)
    with expect_failure:
        # Device creation stays inside the context to keep the same set of
        # accepted failures as constructing it inline in the call.
        gpu_device = torch.device("cuda")
        move_model_to_device(model, gpu_device)

Esempio n. 20

0

Mostra file

File: abstractive_summarization_bertsum.py Progetto: pemukl/german-bertabs

    def predict(
        self,
        test_dataset,
        num_gpus=None,
        gpu_ids=None,
        local_rank=-1,
        batch_size=16,
        alpha=0.6,
        beam_size=5,
        min_length=15,
        max_length=150,
        fp16=False,
        verbose=True,
    ):
        """
        Predict the summarization for the input data iterator.

        Args:
            test_dataset (SummarizationDataset): Dataset for which the summary
                to be predicted.
            num_gpus (int, optional): The number of GPUs used in prediction.
                Forwarded to `get_device`. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            local_rank (int, optional): Local rank of the device in distributed
                inferencing. Defaults to -1, which means non-distributed inferencing.
            batch_size (int, optional): The number of test examples in each batch.
                Defaults to 16.
            alpha (float, optional): Length penalty. Defaults to 0.6.
            beam_size (int, optional): Beam size of beam search. Defaults to 5.
            min_length (int, optional): Minimum number of tokens in the output sequence.
                Defaults to 15.
            max_length (int, optional):  Maximum number of tokens in output
                sequence. Defaults to 150.
            fp16 (bool, optional): Intended to select a half-precision model for
                prediction. NOTE: this flag currently has no effect in this
                implementation. Defaults to False.
            verbose (bool, optional): Whether to show the prediction progress bar.
                Defaults to True.

        Returns:
            List of strings which are the summaries

        """
        # Local import: only format_summary needs it, and the file-level
        # import block is not visible from here.
        import re

        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)

        # move model to devices
        def this_model_move_callback(model, device):
            model = move_model_to_device(model, device)
            return parallelize_model(model,
                                     device,
                                     num_gpus=num_gpus,
                                     gpu_ids=gpu_ids,
                                     local_rank=local_rank)

        # NOTE(review): `fp16` is accepted but ignored. The previous branch was
        # `if fp16: self.model = self.model`, a no-op. Confirm whether
        # `self.model.half()` was intended before enabling half precision.

        self.model = move_model_to_device(self.model, device)
        self.model.eval()

        predictor = build_predictor(
            self.processor.tokenizer,
            self.processor.symbols,
            self.model,
            alpha=alpha,
            beam_size=beam_size,
            min_length=min_length,
            max_length=max_length,
        )
        predictor = this_model_move_callback(predictor, device)
        self.model = parallelize_model(
            self.model,
            device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        test_sampler = SequentialSampler(test_dataset)

        def collate_fn(data):
            return self.processor.collate(data,
                                          self.max_pos_length,
                                          device,
                                          train_mode=False)

        test_dataloader = DataLoader(
            test_dataset,
            sampler=test_sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
        print("dataset length is {}".format(len(test_dataset)))

        def format_summary(translation):
            """ Transforms the output of the `from_batch` function
            into nicely formatted summaries.
            """
            summary = translation
            # Drop the special tokens that carry no text.
            for special_token in ("[unused0]", "[unused3]", "[CLS]", "[SEP]",
                                  "[PAD]", "[unused1]"):
                summary = summary.replace(special_token, "")
            # Collapse runs of spaces left behind by the removals. This must
            # be a regex substitution: str.replace(r" +", " ") only matches the
            # literal two characters " +" and would strip plus signs instead.
            summary = re.sub(r" +", " ", summary)
            # "[unused2]" is replaced by "." when surrounded by spaces and
            # removed otherwise.
            summary = summary.replace(" [unused2] ", ".").replace(
                "[unused2]", "")
            return summary.strip()

        def generate_summary_from_tokenid(preds):
            """Convert each row of predicted token ids into a token string."""
            translations = []
            for b in range(preds.size()[0]):
                if len(preds[b]) < 1:
                    pred_sents = ""
                else:
                    # Id 0 is skipped before converting ids back to tokens.
                    pred_sents = self.processor.tokenizer.convert_ids_to_tokens(
                        [int(n) for n in preds[b] if int(n) != 0])
                    # Re-attach word pieces marked with the "##" prefix.
                    pred_sents = " ".join(pred_sents).replace(" ##", "")
                translations.append(pred_sents)
            return translations

        generated_summaries = []

        for batch in tqdm(test_dataloader,
                          desc="Generating summary",
                          disable=not verbose):
            # Named `inputs` to avoid shadowing the builtin `input`.
            inputs = self.processor.get_inputs(batch,
                                               device,
                                               "bert",
                                               train_mode=False)
            translations, _scores = predictor(**inputs)

            translations_text = generate_summary_from_tokenid(translations)
            generated_summaries.extend(
                format_summary(t) for t in translations_text)

        # release GPU memories
        # self.model.cpu()
        # torch.cuda.empty_cache()

        return generated_summaries

Esempio n. 21

0

Mostra file

File: abstractive_summarization_seq2seq.py Progetto: zeyefkey/nlp-recipes

    def predict(
        self,
        test_dataset,
        per_gpu_batch_size=4,
        max_tgt_length=64,
        beam_size=1,
        need_score_traces=False,
        length_penalty=0,
        forbid_duplicate_ngrams=True,
        forbid_ignore_word=".",
        s2s_config=S2SConfig(),
        num_gpus=None,
        gpu_ids=None,
        local_rank=-1,
        fp16=False,
        verbose=True,
    ):
        """
        Method for predicting, i.e. generating summaries.
        Args:
            test_dataset (S2SAbsSumDataset): Testing dataset.
            per_gpu_batch_size (int, optional): Number of testing samples in each
                batch per GPU. Defaults to 4.
            max_tgt_length (int, optional): Maximum number of tokens in output
                sequence. Defaults to 64.
            beam_size (int, optional): Beam size of beam search. Defaults to 1.
            need_score_traces (bool, optional): Whether to return score traces of
                beam search. Defaults to False.
            length_penalty (float, optional): Length penalty for beam search.
                Defaults to 0.
            forbid_duplicate_ngrams (bool, optional): Whether to forbid duplicate
                n-grams when generating output. Size of the n-gram is determined by
                `S2SConfig.ngram_size` which defaults to 3. Defaults to True.
            forbid_ignore_word (str, optional): Words to ignore when forbidding
                duplicate ngrams. Multiple words should be separated by "|", for
                example, ".|[X_SEP]". Defaults to ".".
            s2s_config (S2SConfig, optional): Some default decoding settings that
                the users usually don't need to change. Defaults to S2SConfig().
            num_gpus (int, optional): Number of GPUs to use. Ignored if `gpu_ids` is
                provided. Defaults to None and all available GPUs are used.
            gpu_ids (list, optional): List of GPU IDs to use. Defaults to None and GPUs
                used are determined by num_gpus.
            local_rank (int, optional): Rank of the device in distributed training.
                Defaults to -1 which means non-distributed training.
            fp16 (bool, optional): Whether to convert the decoder model to half
                precision (`model.half()`) before prediction. Defaults to False.
            verbose(bool, optional): Whether to output predicting log. Defaults to True.

        Returns:
            List or tuple of lists: List of generated summaries. If `need_score_traces`
                is True, also returns the score traces of beam search.

        Raises:
            ValueError: If `need_score_traces` is requested with `beam_size` <= 1,
                or if `max_tgt_length` >= `self.max_seq_length - 2`.

        """

        if need_score_traces and beam_size <= 1:
            raise ValueError(
                "Score trace is only available for beam search with beam size > 1."
            )
        if max_tgt_length >= self.max_seq_length - 2:
            raise ValueError("Maximum tgt length exceeds max seq length - 2.")

        # preprocessing pipeline
        is_roberta = self._model_type == "roberta"
        no_segment_embedding = is_roberta
        vocab = self.tokenizer.encoder if is_roberta else self.tokenizer.vocab

        # Only "unilm-*" / "unilm1-*" model names use the new segment ids.
        # Names starting with "unilm1.2" match neither prefix, so no separate
        # exclusion is required.
        new_segment_ids = self._model_name.startswith(("unilm-", "unilm1-"))

        cls_token = "<s>" if is_roberta else "[CLS]"
        sep_token = "</s>" if is_roberta else "[SEP]"
        pad_token = "<pad>" if is_roberta else "[PAD]"
        mask_token = "<mask>" if is_roberta else "[MASK]"

        # Source text gets whatever room is left after the target and the
        # two positions subtracted here.
        max_src_length = self.max_seq_length - 2 - max_tgt_length
        bi_uni_pipeline = [
            seq2seq_loader.Preprocess4Seq2seqDecoder(
                list(vocab.keys()),
                self.tokenizer.convert_tokens_to_ids,
                self.max_seq_length,
                max_tgt_length=max_tgt_length,
                new_segment_ids=new_segment_ids,
                mode=s2s_config.mode,
                num_qkv=s2s_config.num_qkv,
                s2s_special_token=s2s_config.s2s_special_token,
                s2s_add_segment=s2s_config.s2s_add_segment,
                s2s_share_segment=s2s_config.s2s_share_segment,
                pos_shift=s2s_config.pos_shift,
                cls_token=cls_token,
                sep_token=sep_token,
                pad_token=pad_token,
            )
        ]

        def collate_fn(input_batch):
            """Truncate sources, run the preprocessing pipeline and batch them."""
            buf_id = [x[0] for x in input_batch]
            buf = [x[1][:max_src_length] for x in input_batch]
            max_a_len = max(len(x) for x in buf)
            instances = []
            for instance in [(x, max_a_len) for x in buf]:
                for proc in bi_uni_pipeline:
                    instance = proc(instance)
                instances.append(instance)
            batch = seq2seq_loader.batch_list_to_batch_tensors(instances)

            return (batch, buf_id)

        # prepare decoder
        pair_num_relation = 0
        cls_num_labels = 2
        if new_segment_ids:
            type_vocab_size = 6 + (1 if s2s_config.s2s_add_segment else 0)
        else:
            type_vocab_size = 2
        # The separator token is used for both the end-of-sequence and the
        # start-of-sequence ids.
        (
            mask_word_id,
            eos_word_ids,
            sos_word_id,
        ) = self.tokenizer.convert_tokens_to_ids(
            [mask_token, sep_token, sep_token])
        forbid_ignore_set = None
        if forbid_ignore_word:
            # Bracketed words such as "[x_sep]" are upper-cased before lookup.
            w_list = [
                w.upper() if w.startswith("[") and w.endswith("]") else w
                for w in forbid_ignore_word.split("|")
            ]
            forbid_ignore_set = set(
                self.tokenizer.convert_tokens_to_ids(w_list))

        # Read weights from the wrapped module when the model exposes one.
        if hasattr(self.model, "module"):
            state_dict = self.model.module.state_dict()
        else:
            state_dict = self.model.state_dict()

        model = BertForSeq2SeqDecoder.from_pretrained(
            self._bert_model_name,
            state_dict=state_dict,
            num_labels=cls_num_labels,
            num_rel=pair_num_relation,
            type_vocab_size=type_vocab_size,
            task_idx=3,
            mask_word_id=mask_word_id,
            search_beam_size=beam_size,
            length_penalty=length_penalty,
            eos_id=eos_word_ids,
            sos_id=sos_word_id,
            forbid_duplicate_ngrams=forbid_duplicate_ngrams,
            forbid_ignore_set=forbid_ignore_set,
            ngram_size=s2s_config.forbid_ngram_size,
            min_len=s2s_config.min_len,
            mode=s2s_config.mode,
            max_position_embeddings=self.max_seq_length,
            ffn_type=s2s_config.ffn_type,
            num_qkv=s2s_config.num_qkv,
            seg_emb=s2s_config.seg_emb,
            pos_shift=s2s_config.pos_shift,
            is_roberta=is_roberta,
            no_segment_embedding=no_segment_embedding,
        )

        del state_dict

        if fp16:
            model.half()
        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)

        # move model
        model = move_model_to_device(model=model, device=device)

        batch_size = per_gpu_batch_size * max(1, num_gpus)

        model = parallelize_model(
            model=model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        # torch.cuda.empty_cache()
        model.eval()
        first_batch = True
        batch_count = 0

        output_lines = [""] * len(test_dataset)
        score_trace_list = [None] * len(test_dataset)

        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(
            test_dataset,
            sampler=test_sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
        # Pre-bind `batch` so the cleanup `del batch` below cannot raise
        # UnboundLocalError when the dataloader yields no batches.
        batch = None
        for batch, buf_id in tqdm(test_dataloader,
                                  desc="Evaluating",
                                  disable=not verbose):
            batch_count += 1
            with torch.no_grad():
                batch = [
                    t.to(device) if t is not None else None for t in batch
                ]
                (
                    input_ids,
                    token_type_ids,
                    position_ids,
                    input_mask,
                    mask_qkv,
                    task_idx,
                ) = batch
                traces = model(
                    input_ids,
                    token_type_ids,
                    position_ids,
                    input_mask,
                    task_idx=task_idx,
                    mask_qkv=mask_qkv,
                )
                # With beam search the model output is treated as a mapping
                # holding "pred_seq" (and the score traces); otherwise it is
                # treated as a tensor of output ids.
                if beam_size > 1:
                    traces = {k: v.tolist() for k, v in traces.items()}
                    output_ids = traces["pred_seq"]
                else:
                    output_ids = traces.tolist()

                for i in range(len(input_ids)):
                    w_ids = output_ids[i]
                    output_buf = self.tokenizer.convert_ids_to_tokens(w_ids)
                    # Keep tokens up to the first separator/padding token.
                    output_tokens = []
                    for t in output_buf:
                        if t in (sep_token, pad_token):
                            break
                        output_tokens.append(t)
                    if is_roberta:
                        output_sequence = self.tokenizer.convert_tokens_to_string(
                            output_tokens)
                    else:
                        output_sequence = " ".join(detokenize(output_tokens))
                    # Keep each summary on a single line.
                    if "\n" in output_sequence:
                        output_sequence = " [X_SEP] ".join(
                            output_sequence.split("\n"))
                    # buf_id maps the in-batch position back to the dataset slot.
                    output_lines[buf_id[i]] = output_sequence
                    if first_batch or batch_count % 50 == 0:
                        logger.info("{} = {}".format(buf_id[i],
                                                     output_sequence))
                    if need_score_traces:
                        score_trace_list[buf_id[i]] = {
                            "scores": traces["scores"][i],
                            "wids": traces["wids"][i],
                            "ptrs": traces["ptrs"][i],
                        }

            first_batch = False

        # Drop references to the decoder and the last batch before clearing
        # the CUDA cache.
        del model
        del batch
        torch.cuda.empty_cache()

        if need_score_traces:
            return output_lines, score_trace_list
        return output_lines

Esempio n. 22

0

Mostra file

File: sequence_classification_distributed.py Progetto: yhe0802/nlp-recipes

    def predict(self, test_loader, num_gpus=None, probabilities=False):
        """

        Method to predict the results on the test loader. Only evaluates for non distributed
        workload on the head node in a distributed setup.

        Args:
            test_loader(torch Dataloader): Torch Dataloader created from Torch Dataset.
                Each item is indexed with the keys "token_ids", "input_mask",
                "labels" and, optionally, "token_type_ids".
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            probabilities (bool, optional):
                If True, the predicted probability distribution
                is also returned. Defaults to False.

        Returns:
            1darray, dict(1darray, 1darray, ndarray): Predicted classes and target labels or
                a dictionary with classes, target labels, probabilities) if probabilities is True.
        """
        device, num_gpus = get_device(num_gpus)
        self.model = move_model_to_device(self.model, device, num_gpus)

        # score
        self.model.eval()

        preds = []
        test_labels = []
        for data in tqdm(test_loader, desc="Iteration"):
            # Move inputs to the device chosen by get_device instead of
            # calling .cuda() unconditionally, so that a CPU device works and
            # the inputs always live where the model was moved.
            x_batch = data["token_ids"].to(device)
            mask_batch = data["input_mask"].to(device)

            # Labels are only collected for the caller; they are not fed to
            # the model and therefore stay where the loader put them.
            y_batch = data["labels"]

            token_type_ids_batch = None
            if "token_type_ids" in data and data["token_type_ids"] is not None:
                token_type_ids_batch = data["token_type_ids"].to(device)

            with torch.no_grad():
                p_batch = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
            preds.append(p_batch.cpu())
            test_labels.append(y_batch)

        preds = np.concatenate(preds)
        test_labels = np.concatenate(test_labels)

        if probabilities:
            return {
                "Predictions":
                preds.argmax(axis=1),
                "Target":
                test_labels,
                "classes probabilities":
                nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
            }
        return preds.argmax(axis=1), test_labels

Esempio n. 23

0

Mostra file

File: abstractive_summarization_bertsum.py Progetto: pemukl/german-bertabs

    def fit(
        self,
        train_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=4,
        local_rank=-1,
        max_steps=5e4,
        warmup_steps_bert=20000,
        warmup_steps_dec=10000,
        learning_rate_bert=0.002,
        learning_rate_dec=0.2,
        optimization_method="adam",
        max_grad_norm=0,
        beta1=0.9,
        beta2=0.999,
        decay_method="noam",
        gradient_accumulation_steps=1,
        report_every=10,
        save_every=1000,
        verbose=True,
        seed=None,
        fp16=False,
        fp16_opt_level="O2",
        world_size=1,
        rank=0,
        validation_function=None,
        checkpoint=None,
        **kwargs,
    ):
        """
        Fine-tune a pre-trained transformer model for abstractive summarization.

        The encoder and the decoder are trained with two separate optimizers,
        which are passed together as a list to the parent class's `fine_tune`.
        After training the model is moved back to CPU and saved.

        Args:
            train_dataset (SummarizationDataset): Training dataset.
            num_gpus (int, optional): The number of GPUs to use. If None, all
                available GPUs will be used. If set to 0 or GPUs are not available,
                CPU device will be used. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Batch size handed to the `DataLoader`.
                Defaults to 4.
            local_rank (int, optional): Local_rank for distributed training on GPUs.
                Local rank means the ranking of the current GPU device on the current
                node. Defaults to -1, which means non-distributed training.
            max_steps (int, optional): Maximum number of training steps.
                Defaults to 5e4.
            warmup_steps_bert (int, optional): Number of steps taken to increase
                learning rate from 0 to `learning_rate_bert` for tuning the BERT
                encoder. Defaults to 20000.
            warmup_steps_dec (int, optional): Number of steps taken to increase
                learning rate from 0 to `learning_rate_dec` for tuning the decoder.
                Defaults to 10000.
            learning_rate_bert (float, optional): Learning rate of the optimizer
                for the encoder. Defaults to 0.002.
            learning_rate_dec (float, optional): Learning rate of the optimizer
                for the decoder. Defaults to 0.2.
            optimization_method (string, optional): Optimization method used in fine
                tuning. Defaults to "adam".
            max_grad_norm (float, optional): Maximum gradient norm for gradient
                clipping. Defaults to 0.
            beta1 (float, optional): The exponential decay rate for the first moment
                estimates. Defaults to 0.9.
            beta2 (float, optional): The exponential decay rate for the second-moment
                estimates. This value should be set close to 1.0 on problems with
                a sparse gradient. Defaults to 0.999.
            decay_method (string, optional): Learning rate decrease method.
                Accepted for interface compatibility; it is not read by this
                method. Defaults to "noam".
            gradient_accumulation_steps (int, optional): Number of batches to
                accumulate gradients on between each model parameter update.
                Defaults to 1.
            report_every (int, optional): The interval by steps to print out the
                training log. Defaults to 10.
            save_every (int, optional): The interval by steps to save the finetuned
                model. Defaults to 1000.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
            fp16 (bool, optional): Whether to use mixed precision training.
                Defaults to False.
            fp16_opt_level (str, optional): optimization level, refer to
                 https://nvidia.github.io/apex/amp.html#opt-levels for details.
                 Value choices are: "O0", "O1", "O2", "O3". Defaults to "O2".
            world_size (int, optional): Total number of GPUs that will be used.
                Defaults to 1.
            rank (int, optional): Global rank of the current GPU in distributed
                training. It's calculated with the rank of the current node in the
                cluster/world and the `local_rank` of the device in the current node.
                See an example in :file: `examples/text_summarization/
                abstractive_summarization_bertsum_cnndm_distributed_train.py`.
                Defaults to 0.
            validation_function (function, optional): function used in fitting to
                validate the performance. Default to None.
            checkpoint (str, optional): file path for a checkpoint based on which the
                training continues. Default to None.
            **kwargs: Accepted for interface compatibility; not read by this method.
        """

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)
        print("device is {}".format(device))

        # Restore the model weights (on CPU) before the model is moved to the
        # target device.
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        if checkpoint:
            checkpoint = torch.load(checkpoint, map_location="cpu")
            self.model.load_checkpoint(checkpoint["model"])
        self.model = move_model_to_device(model=self.model, device=device)

        # init optimizers: one for the BERT encoder, one for the decoder
        self.optim_bert = model_builder.build_optim_bert(
            self.model,
            optim=optimization_method,
            lr_bert=learning_rate_bert,
            warmup_steps_bert=warmup_steps_bert,
            max_grad_norm=max_grad_norm,
            beta1=beta1,
            beta2=beta2,
        )
        self.optim_dec = model_builder.build_optim_dec(
            self.model,
            optim=optimization_method,
            lr_dec=learning_rate_dec,
            warmup_steps_dec=warmup_steps_dec,
            max_grad_norm=max_grad_norm,
            beta1=beta1,
            beta2=beta2,
        )

        optimizers = [self.optim_bert, self.optim_dec]

        self.amp = get_amp(fp16)
        if self.amp:
            # Keep the optimizers returned by amp.initialize (a list in gives a
            # list back) instead of binding them to an unused local.
            self.model, optimizers = self.amp.initialize(
                self.model, optimizers, opt_level=fp16_opt_level)
            self.optim_bert, self.optim_dec = optimizers

        global_step = 0
        if checkpoint:
            if "optimizers" in checkpoint and checkpoint["optimizers"]:
                saved_states = checkpoint["optimizers"]
                for i, optimizer in enumerate(optimizers):
                    model_builder.load_optimizer_checkpoint(
                        optimizer, saved_states[i])
            if self.amp and "amp" in checkpoint and checkpoint["amp"]:
                self.amp.load_state_dict(checkpoint["amp"])
            if "global_step" in checkpoint and checkpoint["global_step"]:
                # Floor division keeps the step counter an integer; true
                # division yielded a float such as 1000.0 or 333.33.
                # NOTE(review): presumably the saved counter is summed over
                # all processes, hence the division -- confirm.
                global_step = int(checkpoint["global_step"] // world_size)
                print("global_step is {}".format(global_step))

        self.model = parallelize_model(model=self.model,
                                       device=device,
                                       num_gpus=num_gpus,
                                       gpu_ids=gpu_ids,
                                       local_rank=local_rank,
                                       apex=self.amp)

        # Non-distributed runs shuffle the whole data set; distributed runs
        # give each rank its own shard.
        if local_rank == -1:
            sampler = RandomSampler(train_dataset)
        else:
            sampler = DistributedSampler(train_dataset,
                                         num_replicas=world_size,
                                         rank=rank)

        def collate_fn(data):
            return self.processor.collate(data,
                                          block_size=self.max_pos_length,
                                          device=device)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=sampler,
                                      batch_size=batch_size,
                                      collate_fn=collate_fn)

        # compute the max number of training steps
        max_steps = compute_training_steps(
            train_dataloader,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=BertSumAbsProcessor.get_inputs,
            device=device,
            num_gpus=num_gpus,
            max_steps=max_steps,
            global_step=global_step,
            max_grad_norm=max_grad_norm,
            gradient_accumulation_steps=gradient_accumulation_steps,
            verbose=verbose,
            seed=seed,
            report_every=report_every,
            save_every=save_every,
            clip_grad_norm=False,
            optimizer=optimizers,
            scheduler=None,
            fp16=fp16,
            amp=self.amp,
            validation_function=validation_function,
        )

        # release GPU memories
        self.model.cpu()
        torch.cuda.empty_cache()

        self.save_model(max_steps)

Esempio n. 24

0

Mostra file

    def fit(
        self,
        train_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=3000,
        local_rank=-1,
        max_steps=5e5,
        warmup_steps=1e5,
        learning_rate=2e-3,
        optimization_method="adam",
        max_grad_norm=0,
        beta1=0.9,
        beta2=0.999,
        decay_method="noam",
        gradient_accumulation_steps=1,
        report_every=50,
        verbose=True,
        seed=None,
        save_every=-1,
        world_size=1,
        rank=0,
        use_preprocessed_data=False,
        **kwargs,
    ):
        """
        Fine-tune a pre-trained transformer model for extractive summarization.

        The method resolves the device, builds the optimizer, wraps the model
        for parallel execution, creates the training data loader and then
        delegates the training loop to the parent class's `fine_tune`.

        Args:
            train_dataset (ExtSumProcessedIterableDataset): Training dataset.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. If set to 0 or GPUs
                are not available, CPU device will be used. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Batch size handed to the data loader
                (described in the code as the number of tokens in a batch).
                Defaults to 3000.
            local_rank (int, optional): Local_rank for distributed training on
                GPUs. Defaults to -1, which means non-distributed training.
            max_steps (int, optional): Maximum number of training steps.
                Defaults to 5e5.
            warmup_steps (int, optional): Number of steps taken to increase
                learning rate from 0 to `learning_rate`. Defaults to 1e5.
            learning_rate (float, optional): Learning rate of the optimizer.
                Defaults to 2e-3.
            optimization_method (string, optional): Optimization method used in
                fine tuning. Defaults to "adam".
            max_grad_norm (float, optional): Maximum gradient norm for gradient
                clipping. Defaults to 0.
            beta1 (float, optional): The exponential decay rate for the first
                moment estimates. Defaults to 0.9.
            beta2 (float, optional): The exponential decay rate for the
                second-moment estimates. This value should be set close to 1.0
                on problems with a sparse gradient. Defaults to 0.999.
            decay_method (string, optional): Learning rate decrease method.
                Defaults to "noam".
            gradient_accumulation_steps (int, optional): Number of batches to
                accumulate gradients on between each model parameter update.
                Defaults to 1.
            report_every (int, optional): The interval by steps to print out
                the training log. Defaults to 50.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
            save_every (int, optional): Forwarded unchanged to `fine_tune`.
                Defaults to -1.
            world_size (int, optional): Number of replicas used when sharding
                the data in distributed training. Defaults to 1.
            rank (int, optional): Global rank of the current GPU in distributed
                training. It's calculated with the rank of the current node in
                the cluster/world and the `local_rank` of the device in the
                current node. See an example in :file:
                `examples/text_summarization/
                extractive_summarization_cnndm_distributed_train.py`.
                Defaults to 0.
            use_preprocessed_data (bool, optional): If True, batches are read
                from `train_dataset.get_stream()` through `get_dataloader`;
                otherwise a `DataLoader` with this processor's collate
                function is built. Defaults to False.
            **kwargs: Accepted for interface compatibility; not read here.
        """

        # Resolve the device, then place the model on it.
        device, num_gpus = get_device(
            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank
        )
        self.model = move_model_to_device(model=self.model, device=device)

        # The optimizer is created before the model is wrapped for parallelism.
        optimizer = model_builder.build_optim(
            self.model, optimization_method, learning_rate, max_grad_norm,
            beta1, beta2, decay_method, warmup_steps,
        )
        self.model = parallelize_model(
            model=self.model, device=device, num_gpus=num_gpus,
            gpu_ids=gpu_ids, local_rank=local_rank,
        )

        # batch_size is the number of tokens in a batch
        if not use_preprocessed_data:

            def _collate(samples):
                return self.processor.collate(
                    samples, block_size=self.max_pos_length, device=device
                )

            # Shuffle everything locally, or shard across ranks when distributed.
            train_sampler = (
                RandomSampler(train_dataset)
                if local_rank == -1
                else DistributedSampler(
                    train_dataset, num_replicas=world_size, rank=rank
                )
            )
            train_dataloader = DataLoader(
                train_dataset, sampler=train_sampler,
                batch_size=batch_size, collate_fn=_collate,
            )
        else:
            train_dataloader = get_dataloader(
                train_dataset.get_stream(), is_labeled=True,
                batch_size=batch_size, world_size=world_size,
                rank=rank, local_rank=local_rank,
            )

        # Let the helper settle the final number of training steps.
        max_steps = compute_training_steps(
            train_dataloader, max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        # Hand the actual loop over to the parent implementation.
        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=ExtSumProcessor.get_inputs,
            device=device,
            num_gpus=num_gpus,
            optimizer=optimizer,
            scheduler=None,
            max_steps=max_steps,
            max_grad_norm=max_grad_norm,
            clip_grad_norm=False,
            gradient_accumulation_steps=gradient_accumulation_steps,
            report_every=report_every,
            save_every=save_every,
            verbose=verbose,
            seed=seed,
        )

Esempio n. 25

0

Mostra file

File: common.py Progetto: yhe0802/nlp-recipes

    def fine_tune(
        self,
        train_dataloader,
        get_inputs,
        num_gpus=None,
        gpu_ids=None,
        max_steps=-1,
        max_grad_norm=1.0,
        gradient_accumulation_steps=1,
        optimizer=None,
        scheduler=None,
        fp16=False,
        fp16_opt_level="O1",
        local_rank=-1,
        verbose=True,
        seed=None,
        report_every=10,
        clip_grad_norm=True,
    ):
        """
        Run the training loop until `max_steps` optimizer steps have been taken.

        The data loader is iterated repeatedly (one pass per epoch); every
        `gradient_accumulation_steps` batches count as one optimizer step.

        Args:
            train_dataloader: Iterable yielding training batches.
            get_inputs (callable): Called as
                `get_inputs(batch, device, self.model_name)`; its result is
                unpacked as keyword arguments for the model.
            num_gpus (int, optional): Forwarded to `get_device` and
                `move_model_to_device`. Defaults to None.
            gpu_ids (list, optional): Forwarded to `move_model_to_device`.
                Defaults to None.
            max_steps (int, optional): Number of optimizer steps to run. With
                the default of -1 the loop body never executes.
            max_grad_norm (float, optional): Maximum gradient norm used when
                `clip_grad_norm` is True. Defaults to 1.0.
            gradient_accumulation_steps (int, optional): Number of batches to
                accumulate gradients on between parameter updates.
                Defaults to 1.
            optimizer: Optimizer whose `step()` is called. Must be provided;
                the default None is not usable because `step()` is called.
            scheduler (optional): If given, its `step()` is called after each
                optimizer step. Defaults to None.
            fp16 (bool, optional): Whether to use apex mixed precision.
                Defaults to False.
            fp16_opt_level (str, optional): apex optimization level.
                Defaults to "O1".
            local_rank (int, optional): Local rank for distributed training;
                the progress bar is only shown for ranks -1 and 0.
                Defaults to -1.
            verbose (bool, optional): Whether to show progress and print the
                training log. Defaults to True.
            seed (int, optional): If not None, passed to
                `Transformer.set_seed`. Defaults to None.
            report_every (int, optional): Interval, in optimizer steps, at
                which the loss is printed. Defaults to 10.
            clip_grad_norm (bool, optional): Whether to clip gradients to
                `max_grad_norm`. Defaults to True.

        Returns:
            tuple: `(global_step, average_loss)` where `average_loss` is the
            accumulated loss divided by the number of optimizer steps, or 0.0
            when no step was performed.

        Raises:
            ImportError: If `fp16` is True and apex is not installed.
        """

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

        if seed is not None:
            Transformer.set_seed(seed, num_gpus > 0)

        if fp16:
            try:
                from apex import amp
            except ImportError as err:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex") from err
            # NOTE(review): amp.initialize runs before the model is moved to
            # the device; apex documents the opposite order -- confirm.
            self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level)

        # move model
        self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank)

        # init training
        global_step = 0
        tr_loss = 0.0
        accum_loss = 0
        self.model.train()
        self.model.zero_grad()

        # train
        start = time.time()
        while global_step < max_steps:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose)
            for step, batch in enumerate(epoch_iterator):
                inputs = get_inputs(batch, device, self.model_name)
                outputs = self.model(**inputs)
                loss = outputs[0]

                if num_gpus > 1:
                    # one loss value per GPU replica; reduce to a scalar
                    loss = loss.mean()
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                step_loss = loss.item()
                tr_loss += step_loss
                accum_loss += step_loss

                if (step + 1) % gradient_accumulation_steps == 0:
                    global_step += 1

                    if clip_grad_norm:
                        if fp16:
                            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                        else:
                            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)

                    if global_step % report_every == 0 and verbose:
                        end = time.time()
                        print(
                            "loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, step:{3:.0f}/{4:.0f}".format(
                                accum_loss / report_every, end - start, len(batch), global_step, max_steps,
                            )
                        )
                        accum_loss = 0
                        start = end

                    optimizer.step()
                    if scheduler:
                        scheduler.step()
                    self.model.zero_grad()

                # Stop as soon as the budget is reached; the previous `>`
                # comparison allowed one extra optimizer step mid-epoch.
                if global_step >= max_steps:
                    epoch_iterator.close()
                    break

        # Avoid ZeroDivisionError when no optimizer step was taken
        # (e.g. the default max_steps=-1).
        average_loss = tr_loss / global_step if global_step > 0 else 0.0
        return global_step, average_loss

Esempio n. 26

0

Mostra file

    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        num_gpus=None,
        num_epochs=1,
        batch_size=32,
        learning_rate=2e-5,
        warmup_proportion=None,
    ):
        """
        Fine-tunes the BERT classifier using the given training data.

        The average training loss is printed at the end of every epoch.

        Args:
            token_ids (list): List of lists. Each sublist contains
                numerical token ids corresponding to the tokens in the input
                text data.
            input_mask (list): List of lists. Each sublist contains
                the attention mask of the input token id list. 1 for input
                tokens and 0 for padded tokens, so that padded tokens are
                not attended to.
            labels (list): List of lists, each sublist contains numerical
                token labels of an input sentence/paragraph.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 32.
            learning_rate (float, optional): learning rate of the BertAdam
                optimizer. Defaults to 2e-5.
            warmup_proportion (float, optional): Proportion of training to
                perform linear learning rate warmup for. E.g., 0.1 = 10% of
                training. Defaults to None.
        """

        # Batches are drawn with the "random" sample method.
        train_dataloader = create_data_loader(
            input_ids=token_ids,
            input_mask=input_mask,
            label_ids=labels,
            sample_method="random",
            batch_size=batch_size,
        )

        # Resolve the device and GPU count, then place the model on it.
        device, num_gpus = get_device(num_gpus)

        self.model = move_model_to_device(self.model, device, num_gpus)

        # Cap the GPU count by what torch reports as available; used below to
        # decide whether the loss has to be averaged.
        if num_gpus is None:
            num_gpus_used = torch.cuda.device_count()
        else:
            num_gpus_used = min(num_gpus, torch.cuda.device_count())

        # Whole batches per epoch times epochs, with a floor of one step so a
        # data set smaller than one batch still yields a positive count.
        num_train_optimization_steps = max(
            (int(len(token_ids) / batch_size) * num_epochs), 1)
        optimizer = self._get_optimizer(
            learning_rate=learning_rate,
            num_train_optimization_steps=num_train_optimization_steps,
            warmup_proportion=warmup_proportion,
        )

        self.model.train()
        for _ in trange(int(num_epochs), desc="Epoch"):
            # Per-epoch running loss and step count for the printed average.
            tr_loss = 0
            nb_tr_steps = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", mininterval=30)):
                # Move every tensor of the batch to the training device.
                batch = tuple(t.to(device) for t in batch)
                b_token_ids, b_input_mask, b_label_ids = batch

                # The model's output is used directly as the loss here.
                loss = self.model(input_ids=b_token_ids,
                                  attention_mask=b_input_mask,
                                  labels=b_label_ids)

                if num_gpus_used > 1:
                    # mean() to average on multi-gpu.
                    loss = loss.mean()
                # Accumulate parameter gradients
                loss.backward()

                tr_loss += loss.item()
                nb_tr_steps += 1

                # Update parameters based on current gradients
                optimizer.step()
                # Reset parameter gradients to zero
                optimizer.zero_grad()

            # NOTE(review): nb_tr_steps stays 0 if the data loader yields no
            # batches, which makes this division fail.
            train_loss = tr_loss / nb_tr_steps
            print("Train loss: {}".format(train_loss))

            # Release cached GPU memory at the end of every epoch.
            torch.cuda.empty_cache()