Example #1
    def __init__(self,
                 bert: BERT,
                 config: BERTConfig,
                 train_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10):
        """
        :param bert: BERT class
        :param config: BERT config class
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # Initialize the BERT Language Model, with the BERT encoder inside
        self.model = BERTLM(bert, config).to(self.device)

        # This BERT encoder will be saved every epoch
        self.bert = bert

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(),
                          lr=lr,
                          betas=betas,
                          weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim,
                                             self.bert.hidden,
                                             n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))
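ScheduledOptim is referenced above but not defined in these snippets. As a point of reference, a minimal sketch of the Noam-style warmup schedule such a wrapper commonly implements is shown below; the class and attribute names are assumptions, not the original implementation.

class ScheduledOptimSketch:
    """Wraps an optimizer and rescales its learning rate with warmup (sketch)."""

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0

    def _lr_scale(self):
        # Noam schedule: d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        step = self.n_current_steps
        return (self.d_model ** -0.5) * min(step ** -0.5,
                                            step * self.n_warmup_steps ** -1.5)

    def zero_grad(self):
        self._optimizer.zero_grad()

    def step_and_update_lr(self):
        self.n_current_steps += 1
        lr = self._lr_scale()
        for group in self._optimizer.param_groups:
            group['lr'] = lr
        self._optimizer.step()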
Example #2
    def __init__(self, bert: BERT, vocab_size: int,
                 train_dataloader: DataLoader, test_dataloader: DataLoader = None,
                 with_cuda: bool = True, cuda_devices=None, log_freq: int = hp.log_freq, args=None, global_step=0, path=None):
        """
        :param bert: MLM model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        self.args = args
        self.step = global_step
        self.path = path

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0,1,2,3" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        total_steps = hp.epochs * len(self.train_data)
        self.optimer = optim4GPU(self.model, total_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        # Writer
        self.log_freq = log_freq
        # train
        self.train_loss_writer = SummaryWriter(f'{self.path.runs_path}/train/train_loss')
        self.train_attn_layer_writer = SummaryWriter(f'{self.path.runs_path}/train/attn_layer')
        self.train_model_param_writer = SummaryWriter(f'{self.path.runs_path}/train/model_param')
        # valid
        self.valid_loss_writer = SummaryWriter(f'{self.path.runs_path}/valid/valid_loss')
        self.valid_attn_layer_writer = SummaryWriter(f'{self.path.runs_path}/valid/valid_attn_layer')

        self.num_params()
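optim4GPU above is a project-specific helper that is not shown here. A rough stand-in using only standard PyTorch (AdamW with linear warmup followed by linear decay over total_steps) might look like the sketch below; the learning rate and warmup ratio are assumed values.

from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR


def build_optimizer_sketch(model, total_steps, lr=1e-4, warmup_ratio=0.1):
    warmup_steps = int(total_steps * warmup_ratio)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)  # linear warmup
        # linear decay down to zero at total_steps
        return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))

    scheduler = LambdaLR(optimizer, lr_lambda)
    return optimizer, scheduler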
Example #3
    def __init__(self, bert: BERT, vocab_size, train_dataloader, test_dataloader=None):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.bert = bert
        self.model = BERTLM(bert, vocab_size).to(self.device)

        if torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        self.train_data = train_dataloader
        self.test_data = test_dataloader

        self.optim = Adam(self.model.parameters(), lr=1e-4, betas=(0.9, 0.999), weight_decay=0.01)
        self.criterion = nn.NLLLoss(ignore_index=0)
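Example #3 only defines the constructor. A typical driver loop for such a trainer, assuming train/test iteration and save methods shaped like the fuller examples in this collection, is sketched below; the epoch count and output path are placeholders.

def run_training_sketch(trainer, epochs=10, output_path="output/bert_trained.model"):
    for epoch in range(epochs):
        trainer.iteration(epoch, trainer.train_data, train=True)      # training pass
        if trainer.test_data is not None:
            trainer.iteration(epoch, trainer.test_data, train=False)  # evaluation pass
        trainer.save(epoch, output_path)                              # checkpoint once per epoch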
Example #4
    def __init__(self, bert: BERT, vocab_size: int, epochs: int,
                 tensorboard_log_dir: str, output_path: str,
                 train_dataloader: DataLoader,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, log_freq: int = 10, save_steps: int = -1):

        super(BERTTrainer, self).__init__(bert=bert, epochs=epochs,
                                               tensorboard_log_dir=tensorboard_log_dir,
                                               output_path=output_path,
                                               train_dataloader=train_dataloader,
                                               with_cuda=with_cuda, log_freq=log_freq, save_steps=save_steps)

        self.model = BERTLM(bert, vocab_size).to(self.device)

        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
Example #5
class BERTTrainer:
    """
    BERTTrainer pretrains the BERT model with two language-model training methods:

        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction

    Please check README.md for details and a simple example.

    """
    def __init__(self,
                 bert: BERT,
                 vocab_size: int,
                 train_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10,
                 pad_index=0):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader
        self.pad_index = pad_index
        # Setting the Adam optimizer with hyper-param
        # self.optim = Adam(self.model.parameters(), lr=lr,
        #                   betas=betas, weight_decay=weight_decay)
        # self.optim_schedule = ScheduledOptim(
        #     self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
        self.optim = SGD(self.model.parameters(), lr=lr, momentum=0.9)
        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=self.pad_index)

        self.log_freq = log_freq

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.model.train()
        return self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.model.eval()
        return self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto-save the model every epoch

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        # pdb.set_trace()
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        def calculate_iter(data):
            next_sent_output, mask_lm_output = self.model.forward(
                data["bert_input"], data["segment_label"], data["adj_mat"],
                train)
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2),
                                       data["bert_label"])
            # also return the MLM output: it is needed below for token accuracy
            return mask_loss, mask_lm_output

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            # pdb.set_trace()
            data = data[0]
            data = {key: value.to(self.device) for key, value in data.items()}

            if train:
                loss, mask_lm_output = calculate_iter(data)
            else:
                with torch.no_grad():
                    loss, mask_lm_output = calculate_iter(data)
            # 1. forward the next_sentence_prediction and masked_lm model
            # next_sent_output, mask_lm_output = self.model.forward(
            #     data["bert_input"], data["segment_label"], data["adj_mat"], train)
            # # pdb.set_trace()
            # # 2-1. NLL(negative log likelihood) loss of is_next classification result
            # # next_loss = self.criterion(next_sent_output, data["is_next"])

            # # 2-2. NLLLoss of predicting masked token word
            # mask_loss = self.criterion(
            #     mask_lm_output.transpose(1, 2), data["bert_label"])
            # # pdb.set_trace()
            # # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            # # loss = next_loss + mask_loss
            # loss = mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim.zero_grad()
                loss.backward()
                # self.optim.step_and_update_lr()
                self.optim.step()
            # pdb.set_trace()
            # mlm prediction accuracy
            # correct = next_sent_output.argmax(
            #     dim=-1).eq(data["is_next"]).sum().item()
            correct = 0
            elements = 0
            for labels, t_labels in zip(mask_lm_output.argmax(dim=-1),
                                        data["bert_label"]):
                correct += sum([
                    1 if l == t and t != self.pad_index else 0
                    for l, t in zip(labels, t_labels)
                ])
                elements += sum([1 for t in t_labels if t != self.pad_index])
            # next sentence prediction accuracy
            # correct = next_sent_output.argmax(
            #     dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            # total_element += data["is_next"].nelement()
            total_element += elements

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0 and i != 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code),
              avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)
        return avg_loss / len(data_iter)

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path (the checkpoint is written directly to this path)
        :return: final_output_path
        """
        # output_path = file_path + ".ep%d" % epoch
        # torch.save(self.bert.cpu(), output_path)
        # self.bert.to(self.device)
        # print("EP:%d Model Saved on:" % epoch, output_path)
        # return output_path

        output_path = file_path  # + ".ep%d" % epoch
        # if self.updated:
        #     return output_path
        # torch.save(self.bert.cpu(), output_path)
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': self.model.state_dict()
                # 'optimizer_state_dict': optimizer.state_dict(),
                # 'loss': loss,
                # ...
            },
            output_path)
        # self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        # self.updated = True
        return output_path
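The accuracy bookkeeping in iteration() above walks over tokens with Python loops. An equivalent vectorized sketch with the same semantics (count a prediction only where the label differs from the padding index) is shown below.

def masked_token_accuracy_sketch(mask_lm_output, bert_label, pad_index=0):
    pred = mask_lm_output.argmax(dim=-1)        # (batch, seq_len) predicted token ids
    mask = bert_label.ne(pad_index)             # True wherever a real label exists
    correct = (pred.eq(bert_label) & mask).sum().item()
    elements = mask.sum().item()
    return correct, elements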
Example #6
class BERTTrainer:
    """
    BERTTrainer pretrains the BERT model with one language-model training method:

        1. Masked Language Model : 3.3.1 Task #1: Masked LM

    Please check README.md for details and a simple example.

    """
    def __init__(self,
                 bert: BERT,
                 vocab_size: int,
                 model: BERTLM,
                 train_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = hp.log_freq,
                 args=None,
                 global_step=0,
                 path=None):
        """
        :param bert: MLM model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        self.args = args
        self.step = global_step
        self.path = path

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0,1,2,3" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        if model is None:
            self.model = BERTLM(bert, vocab_size).to(self.device)
        else:
            self.model = model
        #self.model = BERTLM(bert, vocab_size).to(self.device)
        #self.model = torch.load('./output/model_mlm/mlm_ep2.model')
        #self.model = BertForSA(bert).to(self.device)
        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        total_steps = hp.epochs * len(self.train_data)
        self.optimer = optim4GPU(self.model, total_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        # Writer
        self.log_freq = log_freq
        # train
        self.train_loss_writer = SummaryWriter(
            f'{self.path.runs_path}/train/train_loss')
        self.train_attn_layer_writer = SummaryWriter(
            f'{self.path.runs_path}/train/attn_layer')
        self.train_model_param_writer = SummaryWriter(
            f'{self.path.runs_path}/train/model_param')
        # valid
        self.valid_loss_writer = SummaryWriter(
            f'{self.path.runs_path}/valid/valid_loss')
        self.valid_attn_layer_writer = SummaryWriter(
            f'{self.path.runs_path}/valid/valid_attn_layer')

        self.num_params()

    def train(self):

        train_writer = (self.train_loss_writer, self.train_attn_layer_writer,
                        self.train_model_param_writer)
        valid_writer = (self.valid_loss_writer, self.valid_attn_layer_writer)
        try:
            for epoch in range(hp.epochs):

                # Setting the tqdm progress bar
                data_iter = tqdm.tqdm(enumerate(self.train_data),
                                      desc="EP_%s:%d" % ("train", epoch),
                                      total=len(self.train_data),
                                      bar_format="{l_bar}{r_bar}")

                running_loss = 0
                for i, data in data_iter:

                    self.step += 1

                    # 0. batch_data will be sent into the device(GPU or cpu)
                    data = {
                        key: value.to(self.device)
                        for key, value in data.items()
                    }

                    # 1. forward masked_lm model
                    mask_lm_output, attn_list = self.model.forward(
                        data["mlm_input"], data["input_position"])

                    # 2. NLLLoss of predicting masked token word
                    self.optimer.zero_grad()
                    loss = self.criterion(mask_lm_output.transpose(1, 2),
                                          data["mlm_label"])

                    # 3. backward and optimization only in train
                    loss.backward()
                    self.optimer.step()

                    # loss
                    running_loss += loss.item()
                    avg_loss = running_loss / (i + 1)

                    # write log
                    post_fix = {
                        "epoch": epoch,
                        "iter": i,
                        "step": self.step,
                        "avg_loss": avg_loss,
                        "loss": loss.item()
                    }
                    if i % self.log_freq == 0:
                        data_iter.write(str(post_fix))

                    # writer train loss
                    if self.step % hp.save_train_loss == 0:
                        self.train_loss_writer.add_scalar(
                            'train_loss', loss, self.step)

                    # writer
                    if self.step % hp.save_runs == 0 and data["mlm_input"].size(
                            0) == hp.batch_size:  # 不足batch数量则不采样

                        # writer attns_layer
                        for layer, prob in enumerate(attn_list):
                            prob = prob[0]
                            fig, axs = plt.subplots(1, 4, figsize=(20, 10))
                            print("Layer", layer + 1)
                            for h in range(hp.attn_heads):
                                # a = self.model.bert.layers[layer].multihead.attention[0][h].data
                                self.draw(prob[h].cpu().detach().numpy(), [],
                                          [],
                                          ax=axs[h])
                            plt.savefig(
                                f"{self.path.plt_train_attn_path}/Epoch{epoch}_train_step{self.step}_layer{layer+1}"
                            )
                            # plt.show()

                        # tensorboardX write: one attention image per layer and head
                        for layer_i, prob in enumerate(attn_list):
                            prob = prob[0]
                            for j in range(hp.attn_heads):
                                x = vutils.make_grid(prob[j] * 255)
                                self.train_attn_layer_writer.add_image(
                                    f'Epoch{epoch}_train_attn_layer{layer_i}_head{j + 1}',
                                    x, self.step)

                        for module in self.model.modules(
                        ):  # param.clone().cpu().data.numpy()   .module
                            for name, param in module.named_parameters():
                                self.train_model_param_writer.add_histogram(
                                    f"Epoch{epoch}_train_{name}",
                                    param.clone().cpu().data.numpy(),
                                    self.step)
                        # write model_param todo
                        # for name, param in self.model.module.named_parameters():  # param.clone().cpu().data.numpy()   .module
                        #     self.train_model_param_writer.add_histogram(f"Epoch{epoch}_train_{name}", param.clone().cpu().data.numpy(), self.step)

                    # save model checkpoint
                    if self.step % hp.save_checkpoint == 0:
                        self.bert.checkpoint(self.path.bert_checkpoints_path,
                                             self.step)

                    # save bert model
                    if self.step % hp.save_model == 0:
                        self.save_model(epoch, f"{self.path.bert_path}/bert")
                        self.save_mlm_model(epoch, f"{self.path.mlm_path}/mlm")

                    # evaluate
                    if self.step % hp.save_valid_loss == 0:
                        valid_loss = self.evaluate(epoch, valid_writer)
                        self.model.train()  # evaluate() leaves the model in eval mode

                valid_loss = self.evaluate(epoch, valid_writer)
                self.model.train()  # switch back to train mode for the next epoch
                print(
                    f"EP_{epoch}, train_avg_loss={avg_loss}, valid_avg_loss={valid_loss}"
                )

            for writer in train_writer:
                writer.close()
            for writer in valid_writer:
                writer.close()

        except BaseException:
            traceback.print_exc()
            for writer in train_writer:
                writer.close()
            for writer in valid_writer:
                writer.close()

    def evaluate(self, epoch, valid_writer):
        (self.valid_loss_writer, self.valid_attn_layer_writer) = valid_writer
        self.model.eval()

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(self.test_data),
                              desc="EP_%s:%d" % ("test", epoch),
                              total=len(self.test_data),
                              bar_format="{l_bar}{r_bar}")

        running_loss = 0
        with torch.no_grad():
            for i, data in data_iter:

                self.step += 1

                # 0. batch_data will be sent into the device(GPU or cpu)
                data = {
                    key: value.to(self.device)
                    for key, value in data.items()
                }

                # 1. forward masked_lm model
                mask_lm_output, attn_list = self.model.forward(
                    data["mlm_input"], data["input_position"])

                # 2. NLLLoss of predicting masked token word
                loss = self.criterion(mask_lm_output.transpose(1, 2),
                                      data["mlm_label"])

                # loss
                running_loss += loss.cpu().detach().numpy()
                avg_loss = running_loss / (i + 1)

                # print log
                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "step": self.step,
                    "avg_loss": avg_loss,
                    "loss": loss.item()
                }
                if i % self.log_freq == 0:
                    data_iter.write(str(post_fix))

                # writer valid loss
                self.valid_loss_writer.add_scalar('valid_loss', loss,
                                                  self.step)

                if self.step % hp.save_runs == 0:
                    # writer attns_layer
                    for layer, prob in enumerate(attn_list):
                        prob = prob[0]
                        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
                        print("Layer", layer + 1)
                        for h in range(hp.attn_heads):
                            # a = self.model.bert.layers[layer].multihead.attention[0][h].data
                            self.draw(prob[h].cpu().detach().numpy(), [], [],
                                      ax=axs[h])
                        plt.savefig(
                            f"{self.path.plt_train_attn_path}/Epoch{epoch}_valid_step{self.step}_layer{layer + 1}"
                        )
                        # plt.show()

                    # tensorboardX write: one attention image per layer and head
                    for layer_i, prob in enumerate(attn_list):
                        prob = prob[0]
                        for j in range(hp.attn_heads):
                            x = vutils.make_grid(prob[j] * 255)
                            self.valid_attn_layer_writer.add_image(
                                f'Epoch{epoch}_valid_attn_layer{layer_i}_head{j + 1}',
                                x, self.step)

            print(f"Valid Over!")
            return avg_loss

    def evaluate_and_print(self, vocab):
        self.model.eval()
        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(self.test_data),
                              total=len(self.test_data),
                              bar_format="{l_bar}{r_bar}")

        running_loss = 0
        with torch.no_grad():
            for i, data in data_iter:

                self.step += 1

                # 0. batch_data will be sent into the device(GPU or cpu)
                data = {
                    key: value.to(self.device)
                    for key, value in data.items()
                }

                # 1. forward masked_lm model
                mask_lm_output, attn_list = self.model.forward(
                    data["mlm_input"], data["input_position"])

                for j, q in enumerate(mask_lm_output):
                    # print the MLM input sequence; positions with token id 4
                    # (presumably the mask id) show the model's prediction in brackets
                    inp = data["mlm_input"][j]
                    char = ''
                    for pos, z in enumerate(inp):
                        if z.item() == 4:
                            char += '[' + vocab.index2char(
                                q[pos].argmax().item()) + ']'
                        else:
                            char += vocab.index2char(z.item())
                    print(char)

                    # print the label sequence and, for each labelled position,
                    # the probability of the true token and the top-10 candidates
                    label = data["mlm_label"][j]
                    char = ''
                    for pos, z in enumerate(label):

                        char += vocab.index2char(z.item())

                        if z.item() != 0:
                            _, topk = torch.topk(q[pos], 10)
                            softmax = nn.Softmax(0)(q[pos])
                            info = vocab.index2char(z.item()) + ' ' + str(
                                softmax[z.item()].item()) + '  '
                            for zz in topk:
                                info += ' ' + vocab.index2char(
                                    zz.item()) + ' ' + str(
                                        softmax[zz.item()].item())
                            print(info)
                    print(char)

                # 2. NLLLoss of predicting masked token word
                loss = self.criterion(mask_lm_output.transpose(1, 2),
                                      data["mlm_label"])

                # loss
                running_loss += loss.cpu().detach().numpy()
                avg_loss = running_loss / (i + 1)

                # print log
                post_fix = {
                    "iter": i,
                    "step": self.step,
                    "avg_loss": avg_loss,
                    "loss": loss.item()
                }
                if i % self.log_freq == 0:
                    data_iter.write(str(post_fix))

                # writer valid loss
                self.valid_loss_writer.add_scalar('valid_loss', loss,
                                                  self.step)

                if self.step % hp.save_runs == 0:
                    # writer attns_layer
                    for layer, prob in enumerate(attn_list):
                        prob = prob[0]
                        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
                        print("Layer", layer + 1)
                        for h in range(hp.attn_heads):
                            # a = self.model.bert.layers[layer].multihead.attention[0][h].data
                            self.draw(prob[h].cpu().detach().numpy(), [], [],
                                      ax=axs[h])
                        plt.savefig(
                            f"{self.path.plt_train_attn_path}/valid_step{self.step}_layer{layer + 1}"
                        )
                        # plt.show()
            print(f"Valid Over!")
            return avg_loss

    def eval(self):
        self.model.eval()

        data_iter = tqdm.tqdm(enumerate(self.test_data),
                              total=len(self.test_data),
                              bar_format="{l_bar}{r_bar}")

        results = []
        with torch.no_grad():
            for i, data in data_iter:
                self.step += 1

                data = {
                    key: value.to(self.device)
                    for key, value in data.items()
                }

                logits, _ = self.model.forward(data["mlm_input"],
                                               data["input_position"])

                accuracy, result = self.calculate(logits, data["mlm_label"])
                results.append(result)

                data_iter.set_description('Iter(acc=%5.3f)' % accuracy)

        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)

    def calculate(self, logits, label_id):
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  # .cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    def stream(self, message):
        sys.stdout.write(f"\r{message}")

    def draw(self, data, x, y, ax):
        seaborn.heatmap(
            data,
            xticklabels=x,
            square=True,
            yticklabels=y,
            vmin=0.0,
            vmax=1.0,  # attention weights lie in [0, 1]
            cbar=False,
            ax=ax)

    def num_params(self, print_out=True):
        params_requires_grad = filter(lambda p: p.requires_grad,
                                      self.model.parameters())
        params_requires_grad = sum(
            [np.prod(p.size()) for p in params_requires_grad])  #/ 1_000_000

        parameters = sum([np.prod(p.size())
                          for p in self.model.parameters()])  #/ 1_000_000
        if print_out:
            print('Total parameters: %d' % parameters)
            print('Trainable (requires_grad) parameters: %d' %
                  params_requires_grad)

    def save_model(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path; the final path is file_path + "_ep%d.model" % epoch
        :return: final_output_path
        """
        output_path = file_path + "_ep%d.model" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path

    def save_mlm_model(self, epoch, file_path="output/mlm_trained.model"):
        """
        Saving the current MLM model on file_path

        :param epoch: current epoch number
        :param file_path: model output path; the final path is file_path + "_ep%d.model" % epoch
        :return: final_output_path
        """
        output_path = file_path + "_ep%d.model" % epoch
        torch.save(self.model.cpu(), output_path)
        self.model.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
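The NLLLoss-based trainers in this collection assume the model emits log-probabilities (nn.NLLLoss expects log-probabilities as input), and the transpose(1, 2) moves the vocabulary dimension to position 1, where NLLLoss expects the class dimension. A minimal shape check with made-up sizes:

import torch
import torch.nn as nn

batch, seq_len, vocab = 2, 8, 100
log_probs = torch.log_softmax(torch.randn(batch, seq_len, vocab), dim=-1)
labels = torch.randint(0, vocab, (batch, seq_len))
criterion = nn.NLLLoss(ignore_index=0)                # index 0 (padding) is ignored
loss = criterion(log_probs.transpose(1, 2), labels)   # input shape: (batch, vocab, seq_len)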
Example #7
class BERTTrainer:
    """
    BERTTrainer pretrains the BERT model with two language-model training methods:

        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction

    Please check README.md for details and a simple example.

    """
    def __init__(self,
                 bert: BERT,
                 vocab_size: int,
                 train_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(),
                          lr=lr,
                          betas=betas,
                          weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim,
                                             self.bert.hidden,
                                             n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto-save the model every epoch

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_close = 0
        total_element = 0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(
                data["bert_input"], data["segment_label"])

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            # next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token word
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2),
                                       data["bert_label"])

            # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = mask_loss

            # print(data)
            # input()

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # masked-token prediction accuracy (exact match and within-tolerance match)
            label_mask = torch.where(
                data["bert_label"] > 0,
                torch.ones(*data["bert_label"].shape).to(self.device),
                torch.zeros(*data["bert_label"].shape).to(self.device))
            correct = torch.mul(
                mask_lm_output.argmax(dim=2).eq(data["t1_raw"]),
                label_mask).sum().item()
            close = torch.mul(
                torch.mul(
                    mask_lm_output.argmax(dim=2).ge(data["t1_raw"] - 10),
                    mask_lm_output.argmax(dim=2).le(data["t1_raw"] + 10)),
                label_mask).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_close += close
            total_element += label_mask.sum().item()
            # print(data['bert_label'], mask_lm_output.argmax(dim=2), correct)
            # input()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element,
                "avg_clo": total_close / total_element,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code),
              avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element, "total_clo=",
              total_close * 100.0 / total_element)

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path; the final path is file_path + ".ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
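Besides exact matches, the iteration() above tracks a "close" count: predictions within +/-10 of the target id, which suggests the token ids encode ordered (e.g. numeric) values. A compact sketch of both counts; the label mask can come straight from a comparison instead of torch.where over ones/zeros tensors.

def exact_and_close_counts_sketch(mask_lm_output, bert_label, raw_target, tol=10):
    pred = mask_lm_output.argmax(dim=2)           # (batch, seq_len) predicted ids
    label_mask = (bert_label > 0).float()         # 1.0 where a label exists
    correct = (pred.eq(raw_target).float() * label_mask).sum().item()
    close = ((pred - raw_target).abs().le(tol).float() * label_mask).sum().item()
    return correct, close, label_mask.sum().item()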
Example #8
class BERTTrainer:
    """
    BERTTrainer pretrains the BERT model with two language-model training methods:

        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction

    Please check README.md for details and a simple example.

    """

    def __init__(self, bert: BERT, vocab_size: int, seq_len: int,
                 train_dataloader: DataLoader,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, cuda_devices=None, log_freq: int = 100):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """
        self.seq_len = seq_len

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train data loader
        self.train_data = train_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)

        # MSE criterion (note: iteration() below uses a custom combined loss instead)
        self.criterion = nn.MSELoss()

        self.log_freq = log_freq

        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.train_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto-save the model every epoch

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the BERT model to get the contextual output and original embeddings
            bert_output, original_emb = self.model.forward(data["bert_input"])
            
            # 2. combined reconstruction and cosine-similarity loss over labelled positions
            cos = nn.CosineSimilarity(dim=2, eps=1e-6)

            key = bert_output[:,0,:].unsqueeze(1).repeat(1, self.seq_len, 1)
            label = data["syn_label"].type(torch.FloatTensor).cuda()
            
            loss_1 = (torch.mul(((bert_output - original_emb) ** 2).mean(dim=2), torch.abs(label)).sum(dim=1) / torch.abs(label).sum(dim=1)).mean()
            # NOTE: `target` is undefined in the original snippet; a target of all
            # ones for the signed cosine similarity is assumed here.
            target = torch.ones_like(label)
            loss_2 = ((torch.sub(target, torch.mul(cos(bert_output, key), label))).sum(dim=1) / torch.abs(label).sum(dim=1)).mean()

            loss = 0.7 * loss_1 + 0.3 * loss_2

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()
            
            else:
                with open('../output/embeddings/raw/result_input_iter{}.pkl'.format(i), "wb") as fb:
                    pickle.dump(data["bert_input"], fb)
                with open('../output/embeddings/raw/result_output_iter{}.pkl'.format(i), "wb") as fb:
                    pickle.dump(bert_output, fb)

            # accumulate the loss
            avg_loss += loss.item()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter))

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path; the final path is file_path + ".ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
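A self-contained sketch of the combined loss used in the iteration() above: a reconstruction term averaged over labelled positions plus a cosine-similarity term against the sequence's first-position representation used as the key. The target of ones for the cosine term mirrors the assumption noted inline (the original leaves target undefined), and the 0.7/0.3 weights are taken from the snippet.

import torch
import torch.nn as nn


def combined_loss_sketch(bert_output, original_emb, key, label, w1=0.7, w2=0.3):
    # bert_output, original_emb, key: (batch, seq_len, hidden); label: (batch, seq_len)
    cos = nn.CosineSimilarity(dim=2, eps=1e-6)
    abs_label = label.abs().float()
    denom = abs_label.sum(dim=1).clamp(min=1.0)   # avoid division by zero

    # masked mean-squared reconstruction error over labelled positions
    loss_1 = (((bert_output - original_emb) ** 2).mean(dim=2) * abs_label).sum(dim=1) / denom

    # push cos(output, key) * label toward 1 on labelled positions (assumed target)
    target = torch.ones_like(abs_label)
    loss_2 = (target - cos(bert_output, key) * label).sum(dim=1) / denom

    return w1 * loss_1.mean() + w2 * loss_2.mean()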
Example #9
File: eval.py Project: Lily1994/BTH
    with only mask_loss and save features with this function.
    '''
    num_sample = 45585 # number of training videos
    new_feats = np.zeros((num_sample,hidden_size),dtype = np.float32)
    rem = num_sample%test_batch_size
    eval_loader = get_eval_loader(train_feat_path,batch_size=test_batch_size)   
    batch_num = len(eval_loader)
    for i, data in enumerate(eval_loader): 
        data = {key: value.cuda() for key, value in data.items()}
        _,_,x = model.forward(data["visual_word"])
        feat = torch.mean(x,1)
        if i == batch_num-1:
            new_feats[i*test_batch_size:,:] = feat[:rem,:].data.cpu().numpy()
        else:
            new_feats[i*test_batch_size:(i+1)*test_batch_size,:] = feat.data.cpu().numpy()
    h5 = h5py.File(latent_feat_path, 'w')
    h5.create_dataset('feats', data = new_feats)
    h5.close()


if __name__ == '__main__':  
    model = BERTLM(feature_size).cuda()
    model.load_state_dict(torch.load(file_path + '/9288.pth'))
    h5_file = h5py.File(test_feat_path, 'r')
    video_feats = h5_file['feats']
    num_sample = len(video_feats)
    print(num_sample)
    model.eval()
    evaluate(model, test_feat_path, label_path ,num_sample)
    # save_nf(model)
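The features written to latent_feat_path above can be read back with h5py for downstream retrieval or evaluation; this is just the mirror of the create_dataset call.

import h5py


def load_latent_feats_sketch(latent_feat_path):
    with h5py.File(latent_feat_path, 'r') as h5:
        feats = h5['feats'][:]    # (num_sample, hidden_size) float32 array
    return feats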
Example #10
best_optimizer_pth_path = file_path + '/best_optimizer.pth'
optimizer_pth_path = file_path + '/optimizer.pth'
print('Learning rate: %.4f' % lr)
infos = {}
infos_best = {}
histories = {}
if use_checkpoint is True and os.path.isfile(
        os.path.join(file_path, 'infos.pkl')):
    # pickle files must be opened in binary mode
    with open(os.path.join(file_path, 'infos.pkl'), 'rb') as f:
        infos = pickle.load(f)

    if os.path.isfile(os.path.join(file_path, 'histories.pkl')):
        with open(os.path.join(file_path, 'histories.pkl'), 'rb') as f:
            histories = pickle.load(f)

model = BERTLM(feature_size).cuda()
train_loader = get_train_loader(train_feat_path,
                                sim_path,
                                batch_size,
                                shuffle=True)

itera = 0
epoch = 0

if use_checkpoint:
    model.load_state_dict(torch.load(file_path + '/9288.pth'))
    itera = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
optimizer = Adam(model.parameters(), lr=lr)
optim_schedule = ScheduledOptim(optimizer, hidden_size, n_warmup_steps=10000)
if os.path.exists(best_optimizer_pth_path) and use_checkpoint:
Example #11
class BERTTrainer:
    def __init__(self, bert: BERT, vocab_size, train_dataloader, test_dataloader=None):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.bert = bert
        self.model = BERTLM(bert, vocab_size).to(self.device)

        if torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        self.train_data = train_dataloader
        self.test_data = test_dataloader

        self.optim = Adam(self.model.parameters(), lr=1e-4, betas=(0.9, 0.999), weight_decay=0.01)
        self.criterion = nn.NLLLoss(ignore_index=0)

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        str_code = "train" if train else "test"
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_iter:
            data = {key: value.to(self.device) for key, value in data.items()}

            next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])
            next_loss = self.criterion(next_sent_output, data["is_next"])
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
            loss = next_loss + mask_loss

            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()

            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            if train:
                self.optim.zero_grad()
                loss.backward()
                self.optim.step()

            post_fix = {
                "epoch": epoch,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100
            }
            data_iter.set_postfix(post_fix)

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)

    def save(self, output_dir, epoch, file_name="bert_trained_ep%d.model"):
        if isinstance(self.model, nn.DataParallel):
            model = self.model.module
        else:
            model = self.model

        with open(os.path.join(output_dir, file_name % epoch), "wb") as f:
            torch.save(model.cpu(), f)

        model.to(self.device)
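Example #11 saves the unwrapped module object itself. A state_dict-based variant, which keeps the DataParallel unwrapping in one place and is usually more robust to code reorganisation, might look like this sketch; the file name pattern is a placeholder.

import os
import torch
import torch.nn as nn


def save_state_dict_sketch(model, output_dir, epoch):
    to_save = model.module if isinstance(model, nn.DataParallel) else model
    path = os.path.join(output_dir, "bert_trained_ep%d.pt" % epoch)
    torch.save(to_save.state_dict(), path)   # weights only, no need to move to CPU first
    return path


def load_state_dict_sketch(model, path, device):
    target = model.module if isinstance(model, nn.DataParallel) else model
    target.load_state_dict(torch.load(path, map_location=device))
    return model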
Example #12
class BERTTrainer(BasicTrainer):
    def __init__(self, bert: BERT, vocab_size: int, epochs: int,
                 tensorboard_log_dir: str, output_path: str,
                 train_dataloader: DataLoader,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, log_freq: int = 10, save_steps: int = -1):

        super(BERTTrainer, self).__init__(bert=bert, epochs=epochs,
                                               tensorboard_log_dir=tensorboard_log_dir,
                                               output_path=output_path,
                                               train_dataloader=train_dataloader,
                                               with_cuda=with_cuda, log_freq=log_freq, save_steps=save_steps)

        self.model = BERTLM(bert, vocab_size).to(self.device)

        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def iteration(self, epoch, data_loader):
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP:%d" % epoch,
                              total=self.n_batches,
                              bar_format="{l_bar}{r_bar}",
                              disable=False)

        avg_loss = 0.0
        for i, data in data_iter:
            global_step = epoch * self.n_batches + i + 1

            data = {key: value.to(self.device) for key, value in data.items()}

            mask_lm_output = self.model.forward(data["bert_input"])

            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            loss = mask_loss

            self.optim_schedule.zero_grad()
            loss.backward()
            self.optim_schedule.step_and_update_lr()

            avg_loss += loss.item()

            self.tensorborad_writer.add_scalar("Masked_language_model loss", mask_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Average loss in epoch", avg_loss / (i + 1), global_step)

            post_fix = {
                "epoch": epoch,
                "iter": i+1,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }

            if (i+1) % self.log_freq == 0:
                data_iter.write(str(post_fix))

            if self.save_steps > 0 and ((i + 1) % self.save_steps == 0 or (i + 1) == self.n_batches):
                self.save(epoch, i + 1)
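Example #12 only implements iteration(); the epoch loop lives in BasicTrainer, which is not shown. A minimal driver consistent with the signature above would look like the sketch below; the epochs and train_data attribute names on the trainer are assumptions.

def run_epochs_sketch(trainer):
    for epoch in range(trainer.epochs):
        trainer.model.train()
        trainer.iteration(epoch, trainer.train_data)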