class BERTTrainer:
    """
    BERTTrainer make the pretrained BERT model with two LM training method.

        1. Masked Language Model : 3.3.1 Task #1: Masked LM

    please check the details on README.md with simple example.

    """
    def __init__(self,
                 bert: BERT,
                 vocab_size: int,
                 model: BERTLM,
                 train_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = hp.log_freq,
                 args=None,
                 global_step=0,
                 path=None):
        """
        :param bert: MLM model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: traning with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        self.args = args
        self.step = global_step
        self.path = path

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        # Primary device; multiple GPUs are handled below via nn.DataParallel
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model is saved/checkpointed periodically during training
        self.bert = bert
        # Use the supplied BERT Language Model, or build a fresh one from `bert`
        if model is None:
            self.model = BERTLM(bert, vocab_size).to(self.device)
        else:
            self.model = model
        #self.model = BERTLM(bert, vocab_size).to(self.device)
        #self.model = torch.load('./output/model_mlm/mlm_ep2.model')
        #self.model = BertForSA(bert).to(self.device)
        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Set up the optimizer (optim4GPU) with the total number of training steps
        total_steps = hp.epochs * len(self.train_data)
        self.optimizer = optim4GPU(self.model, total_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        # Writer
        self.log_freq = log_freq
        # train
        self.train_loss_writer = SummaryWriter(
            f'{self.path.runs_path}/train/train_loss')
        self.train_attn_layer_writer = SummaryWriter(
            f'{self.path.runs_path}/train/attn_layer')
        self.train_model_param_writer = SummaryWriter(
            f'{self.path.runs_path}/train/model_param')
        # valid
        self.valid_loss_writer = SummaryWriter(
            f'{self.path.runs_path}/valid/valid_loss')
        self.valid_attn_layer_writer = SummaryWriter(
            f'{self.path.runs_path}/valid/valid_attn_layer')

        self.num_params()

    def train(self):

        train_writer = (self.train_loss_writer, self.train_attn_layer_writer,
                        self.train_model_param_writer)
        valid_writer = (self.valid_loss_writer, self.valid_attn_layer_writer)
        try:
            for epoch in range(hp.epochs):

                # Setting the tqdm progress bar
                data_iter = tqdm.tqdm(enumerate(self.train_data),
                                      desc="EP_%s:%d" % ("train", epoch),
                                      total=len(self.train_data),
                                      bar_format="{l_bar}{r_bar}")

                running_loss = 0
                for i, data in data_iter:

                    self.step += 1

                    # 0. batch_data will be sent into the device(GPU or cpu)
                    data = {
                        key: value.to(self.device)
                        for key, value in data.items()
                    }

                    # 1. forward masked_lm model
                    mask_lm_output, attn_list = self.model.forward(
                        data["mlm_input"], data["input_position"])

                    # 2. NLLLoss of predicting masked token word
                    self.optimizer.zero_grad()
                    loss = self.criterion(mask_lm_output.transpose(1, 2),
                                          data["mlm_label"])

                    # 3. backward and optimization only in train
                    loss.backward()
                    self.optimizer.step()

                    # loss
                    running_loss += loss.item()
                    avg_loss = running_loss / (i + 1)

                    # write log
                    post_fix = {
                        "epoch": epoch,
                        "iter": i,
                        "step": self.step,
                        "avg_loss": avg_loss,
                        "loss": loss.item()
                    }
                    if i % self.log_freq == 0:
                        data_iter.write(str(post_fix))

                    # writer train loss
                    if self.step % hp.save_train_loss == 0:
                        self.train_loss_writer.add_scalar(
                            'train_loss', loss, self.step)

                    # periodically dump attention maps and parameter histograms
                    if self.step % hp.save_runs == 0 and data["mlm_input"].size(
                            0) == hp.batch_size:  # skip sampling when the last batch is not full

                        # save attention heatmaps per layer (first sample of the batch)
                        for layer, prob in enumerate(attn_list):
                            prob = prob[0]
                            fig, axs = plt.subplots(1, 4, figsize=(20, 10))
                            print("Layer", layer + 1)
                            for h in range(hp.attn_heads):
                                self.draw(prob[h].cpu().detach().numpy(), [],
                                          [],
                                          ax=axs[h])
                            plt.savefig(
                                f"{self.path.plt_train_attn_path}/Epoch{epoch}_train_step{self.step}_layer{layer+1}"
                            )
                            plt.close(fig)

                        # tensorboardX: one image per layer and head
                        for layer, prob in enumerate(attn_list):
                            prob = prob[0]
                            for j in range(hp.attn_heads):
                                x = vutils.make_grid(prob[j] * 255)
                                self.train_attn_layer_writer.add_image(
                                    f'Epoch{epoch}_train_attn_layer{layer}_head{j + 1}',
                                    x, self.step)

                        # model parameter histograms (each parameter written once)
                        for name, param in self.model.named_parameters():
                            self.train_model_param_writer.add_histogram(
                                f"Epoch{epoch}_train_{name}",
                                param.clone().cpu().data.numpy(), self.step)

                    # save model checkpoint
                    if self.step % hp.save_checkpoint == 0:
                        self.bert.checkpoint(self.path.bert_checkpoints_path,
                                             self.step)

                    # save bert model
                    if self.step % hp.save_model == 0:
                        self.save_model(epoch, f"{self.path.bert_path}/bert")
                        self.save_mlm_model(epoch, f"{self.path.mlm_path}/mlm")

                    # evaluate periodically, then restore train mode
                    if self.step % hp.save_valid_loss == 0:
                        valid_loss = self.evaluate(epoch, valid_writer)
                        self.model.train()

                valid_loss = self.evaluate(epoch, valid_writer)
                self.model.train()
                print(
                    f"EP_{epoch}, train_avg_loss={avg_loss}, valid_avg_loss={valid_loss}"
                )

        except BaseException:
            traceback.print_exc()
        finally:
            for writer in train_writer:
                writer.close()
            for writer in valid_writer:
                writer.close()

    def evaluate(self, epoch, valid_writer):
        (self.valid_loss_writer, self.valid_attn_layer_writer) = valid_writer
        self.model.eval()

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(self.test_data),
                              desc="EP_%s:%d" % ("test", epoch),
                              total=len(self.test_data),
                              bar_format="{l_bar}{r_bar}")

        running_loss = 0
        with torch.no_grad():
            for i, data in data_iter:

                self.step += 1

                # 0. batch_data will be sent into the device(GPU or cpu)
                data = {
                    key: value.to(self.device)
                    for key, value in data.items()
                }

                # 1. forward masked_lm model
                mask_lm_output, attn_list = self.model.forward(
                    data["mlm_input"], data["input_position"])

                # 2. NLLLoss of predicting masked token word
                loss = self.criterion(mask_lm_output.transpose(1, 2),
                                      data["mlm_label"])

                # loss
                running_loss += loss.cpu().detach().numpy()
                avg_loss = running_loss / (i + 1)

                # print log
                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "step": self.step,
                    "avg_loss": avg_loss,
                    "loss": loss.item()
                }
                if i % self.log_freq == 0:
                    data_iter.write(str(post_fix))

                # writer valid loss
                self.valid_loss_writer.add_scalar('valid_loss', loss,
                                                  self.step)

                if self.step % hp.save_runs == 0:
                    # save attention heatmaps per layer (first sample of the batch)
                    for layer, prob in enumerate(attn_list):
                        prob = prob[0]
                        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
                        print("Layer", layer + 1)
                        for h in range(hp.attn_heads):
                            self.draw(prob[h].cpu().detach().numpy(), [], [],
                                      ax=axs[h])
                        plt.savefig(
                            f"{self.path.plt_train_attn_path}/Epoch{epoch}_valid_step{self.step}_layer{layer + 1}"
                        )
                        plt.close(fig)

                    # tensorboardX: one image per layer and head
                    for layer, prob in enumerate(attn_list):
                        prob = prob[0]
                        for j in range(hp.attn_heads):
                            x = vutils.make_grid(prob[j] * 255)
                            self.valid_attn_layer_writer.add_image(
                                f'Epoch{epoch}_valid_attn_layer{layer}_head{j + 1}',
                                x, self.step)

            print("Valid over!")
            return avg_loss

    def evaluate_and_print(self, vocab):
        self.model.eval()
        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(self.test_data),
                              total=len(self.test_data),
                              bar_format="{l_bar}{r_bar}")

        running_loss = 0
        with torch.no_grad():
            for i, data in data_iter:

                self.step += 1

                # 0. batch_data will be sent into the device(GPU or cpu)
                data = {
                    key: value.to(self.device)
                    for key, value in data.items()
                }

                # 1. forward masked_lm model
                mask_lm_output, attn_list = self.model.forward(
                    data["mlm_input"], data["input_position"])

                for j, q in enumerate(mask_lm_output):
                    # print the MLM input sequence, with the prediction for each
                    # masked position shown in brackets
                    input_ids = data["mlm_input"][j]
                    char = ''
                    for pos, z in enumerate(input_ids):
                        if z.item() == 4:  # id 4 marks a masked position
                            char += '[' + vocab.index2char(
                                q[pos].argmax().item()) + ']'
                        else:
                            char += vocab.index2char(z.item())
                    print(char)

                    # print the label sequence and, for every masked position,
                    # the label probability together with the top-10 predictions
                    label = data["mlm_label"][j]
                    char = ''
                    for pos, z in enumerate(label):

                        char += vocab.index2char(z.item())

                        if z.item() != 0:
                            _, topk = torch.topk(q[pos], 10)
                            softmax = nn.Softmax(0)(q[pos])
                            info = vocab.index2char(z.item()) + ' ' + str(
                                softmax[z.item()]) + '  '
                            for zz in topk:
                                info += ' ' + vocab.index2char(
                                    zz.item()) + ' ' + str(softmax[zz.item()])
                            print(info)
                    print(char)

                # 2. NLLLoss of predicting masked token word
                loss = self.criterion(mask_lm_output.transpose(1, 2),
                                      data["mlm_label"])

                # loss
                running_loss += loss.cpu().detach().numpy()
                avg_loss = running_loss / (i + 1)

                # print log
                post_fix = {
                    "iter": i,
                    "step": self.step,
                    "avg_loss": avg_loss,
                    "loss": loss.item()
                }
                if i % self.log_freq == 0:
                    data_iter.write(str(post_fix))

                # writer valid loss
                self.valid_loss_writer.add_scalar('valid_loss', loss,
                                                  self.step)

                if self.step % hp.save_runs == 0:
                    # save attention heatmaps per layer (first sample of the batch)
                    for layer, prob in enumerate(attn_list):
                        prob = prob[0]
                        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
                        print("Layer", layer + 1)
                        for h in range(hp.attn_heads):
                            self.draw(prob[h].cpu().detach().numpy(), [], [],
                                      ax=axs[h])
                        plt.savefig(
                            f"{self.path.plt_train_attn_path}/eval_step{self.step}_layer{layer + 1}"
                        )
                        plt.close(fig)
            print("Valid over!")
            return avg_loss

    def eval(self):
        self.model.eval()

        data_iter = tqdm.tqdm(enumerate(self.test_data),
                              total=len(self.test_data),
                              bar_format="{l_bar}{r_bar}")

        results = []
        with torch.no_grad():
            for i, data in data_iter:
                self.step += 1

                data = {
                    key: value.to(self.device)
                    for key, value in data.items()
                }

                logits = self.model.forward(data["mlm_input"],
                                            data["input_position"])

                accuracy, result = self.calculate(logits, data["mlm_label"])
                results.append(result)

                data_iter.set_description('Iter(acc=%5.3f)' % accuracy)

        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)

    def calculate(self, logits, label_id):
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  # .cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    def stream(self, message):
        sys.stdout.write(f"\r{message}")

    def draw(self, data, x, y, ax):
        seaborn.heatmap(
            data,
            xticklabels=x,
            square=True,
            yticklabels=y,
            vmin=0.0,
            vmax=1.0,  # attention weights lie in [0, 1]
            cbar=False,
            ax=ax)

    def num_params(self, print_out=True):
        params_requires_grad = filter(lambda p: p.requires_grad,
                                      self.model.parameters())
        params_requires_grad = sum(
            [np.prod(p.size()) for p in params_requires_grad])  #/ 1_000_000

        parameters = sum([np.prod(p.size())
                          for p in self.model.parameters()])  #/ 1_000_000
        if print_out:
            print('Total Parameters: %d' % parameters)
            print('Trainable (requires_grad) Parameters: %d' %
                  params_requires_grad)

    def save_model(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path which gonna be file_path+"ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + "_ep%d.model" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path

    def save_mlm_model(self, epoch, file_path="output/mlm_trained.model"):
        """
        Saving the current MLM model on file_path

        :param epoch: current epoch number
        :param file_path: model output path which gonna be file_path+"ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + "_ep%d.model" % epoch
        torch.save(self.model.cpu(), output_path)
        self.model.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
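A minimal usage sketch for the trainer above. This is an assumption-based sketch, not the project's actual entry point: the dataset objects, vocab_size, and the paths object are hypothetical stand-ins inferred from the constructor arguments and the attributes the class reads (hp.*, path.runs_path, path.bert_path, ...).

import torch
from torch.utils.data import DataLoader

# Hypothetical driver -- train_dataset / valid_dataset / vocab_size / paths are
# placeholders; only the BERTTrainer arguments themselves come from the class.
train_loader = DataLoader(train_dataset, batch_size=hp.batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=hp.batch_size)

bert = BERT(vocab_size)              # constructor args depend on the project
trainer = BERTTrainer(bert,
                      vocab_size=vocab_size,
                      model=None,                # None -> a fresh BERTLM(bert, vocab_size)
                      train_dataloader=train_loader,
                      test_dataloader=valid_loader,
                      with_cuda=True,
                      path=paths)                # must expose runs_path, bert_path, mlm_path, ...
trainer.train()                                  # runs hp.epochs epochs, logging to TensorBoard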
Example #2
class BERTTrainer:
    """
    BERTTrainer make the pretrained BERT model with two LM training method.

        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction

    please check the details on README.md with simple example.

    """
    def __init__(self,
                 bert: BERT,
                 vocab_size: int,
                 train_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10,
                 pad_index=0):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: traning with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader
        self.pad_index = pad_index
        # Setting the optimizer (SGD with momentum; the Adam + warmup schedule is kept for reference)
        # self.optim = Adam(self.model.parameters(), lr=lr,
        #                   betas=betas, weight_decay=weight_decay)
        # self.optim_schedule = ScheduledOptim(
        #     self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
        self.optim = SGD(self.model.parameters(), lr=lr, momentum=0.9)
        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=self.pad_index)

        self.log_freq = log_freq

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.model.train()
        return self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.model.eval()
        return self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto save the model every peoch

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        # pdb.set_trace()
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        def calculate_iter(data):
            next_sent_output, mask_lm_output = self.model.forward(
                data["bert_input"], data["segment_label"], data["adj_mat"],
                train)
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2),
                                       data["bert_label"])
            # return the predictions as well, so accuracy can be computed below
            return mask_loss, mask_lm_output

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            # pdb.set_trace()
            data = data[0]
            data = {key: value.to(self.device) for key, value in data.items()}

            if train:
                loss, mask_lm_output = calculate_iter(data)
            else:
                with torch.no_grad():
                    loss, mask_lm_output = calculate_iter(data)
            # 1. forward the next_sentence_prediction and masked_lm model
            # next_sent_output, mask_lm_output = self.model.forward(
            #     data["bert_input"], data["segment_label"], data["adj_mat"], train)
            # # pdb.set_trace()
            # # 2-1. NLL(negative log likelihood) loss of is_next classification result
            # # next_loss = self.criterion(next_sent_output, data["is_next"])

            # # 2-2. NLLLoss of predicting masked token word
            # mask_loss = self.criterion(
            #     mask_lm_output.transpose(1, 2), data["bert_label"])
            # # pdb.set_trace()
            # # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            # # loss = next_loss + mask_loss
            # loss = mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim.zero_grad()
                loss.backward()
                # self.optim.step_and_update_lr()
                self.optim.step()
            # pdb.set_trace()
            # masked-token prediction accuracy (padding positions are excluded)
            correct = 0
            elements = 0
            for labels, t_labels in zip(mask_lm_output.argmax(dim=-1),
                                        data["bert_label"]):
                correct += sum([
                    1 if l == t and t != self.pad_index else 0
                    for l, t in zip(labels, t_labels)
                ])
                elements += sum([1 for t in t_labels if t != self.pad_index])

            avg_loss += loss.item()
            total_correct += correct
            total_element += elements

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0 and i != 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code),
              avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)
        return avg_loss / len(data_iter)

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path which gonna be file_path+"ep%d" % epoch
        :return: final_output_path
        """
        # output_path = file_path + ".ep%d" % epoch
        # torch.save(self.bert.cpu(), output_path)
        # self.bert.to(self.device)
        # print("EP:%d Model Saved on:" % epoch, output_path)
        # return output_path

        output_path = file_path  # + ".ep%d" % epoch
        # if self.updated:
        #     return output_path
        # torch.save(self.bert.cpu(), output_path)
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': self.model.state_dict()
                # 'optimizer_state_dict': optimizer.state_dict(),
                # 'loss': loss,
                # ...
            },
            output_path)
        # self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        # self.updated = True
        return output_path
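The train/test methods in this example each run a single epoch and return its average loss, so checkpointing is driven from outside. A sketch of such a driver loop follows (the already-constructed trainer and the epochs count are assumptions):

# Hypothetical outer loop -- `trainer` is an instance of the class above,
# `epochs` is chosen by the caller; train()/test() return the epoch's avg loss.
best_valid = float("inf")
for epoch in range(epochs):
    train_loss = trainer.train(epoch)
    valid_loss = trainer.test(epoch) if trainer.test_data is not None else train_loss
    if valid_loss < best_valid:        # keep only the best checkpoint
        best_valid = valid_loss
        trainer.save(epoch, file_path="output/bert_trained.model")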
Example #3
class BERTTrainer:
    """
    BERTTrainer make the pretrained BERT model with two LM training method.

        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction

    please check the details on README.md with simple example.

    """
    def __init__(self,
                 bert: BERT,
                 vocab_size: int,
                 train_dataloader: DataLoader,
                 test_dataloader: DataLoader = None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(),
                          lr=lr,
                          betas=betas,
                          weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim,
                                             self.bert.hidden,
                                             n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)

        self.log_freq = log_freq

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto save the model every peoch

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_close = 0
        total_element = 0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(
                data["bert_input"], data["segment_label"])

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            # next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token word
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2),
                                       data["bert_label"])

            # 2-3. Only the mask loss is used here (the next-sentence loss is commented out above)
            loss = mask_loss

            # print(data)
            # input()

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # masked-token prediction accuracy: exact matches and matches within ±10 of the target id
            label_mask = torch.where(
                data["bert_label"] > 0,
                torch.ones(*data["bert_label"].shape).to(self.device),
                torch.zeros(*data["bert_label"].shape).to(self.device))
            correct = torch.mul(
                mask_lm_output.argmax(dim=2).eq(data["t1_raw"]),
                label_mask).sum().item()
            close = torch.mul(
                torch.mul(
                    mask_lm_output.argmax(dim=2).ge(data["t1_raw"] - 10),
                    mask_lm_output.argmax(dim=2).le(data["t1_raw"] + 10)),
                label_mask).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_close += close
            total_element += label_mask.sum().item()
            # print(data['bert_label'], mask_lm_output.argmax(dim=2), correct)
            # input()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element,
                "avg_clo": total_close / total_element,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code),
              avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element, "total_clo=",
              total_close * 100.0 / total_element)

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path which gonna be file_path+"ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
class BERTTrainer:
    """
    BERTTrainer make the pretrained BERT model with two LM training method.

        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction

    please check the details on README.md with simple example.

    """

    def __init__(self, bert: BERT, vocab_size: int, seq_len: int,
                 train_dataloader: DataLoader,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, cuda_devices=None, log_freq: int = 100):
        """
        :param bert: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: traning with cuda
        :param log_freq: logging frequency of the batch iteration
        """
        self.seq_len = seq_len

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        # This BERT model will be saved every epoch
        self.bert = bert
        # Initialize the BERT Language Model, with BERT model
        self.model = BERTLM(bert, vocab_size).to(self.device)

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

        # Setting the train data loader
        self.train_data = train_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)

        # MSE loss (the iteration below builds its reconstruction/cosine losses directly)
        self.criterion = nn.MSELoss()

        self.log_freq = log_freq

        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.train_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto save the model every peoch

        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the model to get contextual and original embeddings
            bert_output, original_emb = self.model.forward(data["bert_input"])

            # 2. embedding-reconstruction loss + cosine-similarity loss
            cos = nn.CosineSimilarity(dim=2, eps=1e-6)

            key = bert_output[:,0,:].unsqueeze(1).repeat(1, self.seq_len, 1)
            label = data["syn_label"].type(torch.FloatTensor).cuda()
            
            loss_1 = (torch.mul(((bert_output-original_emb)**2).mean(dim=2), torch.abs(label)).sum(dim=1)/torch.abs(label).sum(dim=1)).mean()
            loss_2 = ((torch.sub(target,torch.mul(cos(bert_output, key), label))).sum(dim=1)/torch.abs(label).sum(dim=1)).mean()
            
            loss = 0.7 * loss_1 + 0.3 * loss_2

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()
            
            else:
                with open('../output/embeddings/raw/result_input_iter{}.pkl'.format(i), "wb") as fb:
                    pickle.dump(data["bert_input"], fb)
                with open('../output/embeddings/raw/result_output_iter{}.pkl'.format(i), "wb") as fb:
                    pickle.dump(bert_output, fb)

            # accumulate loss
            avg_loss += loss.item()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter))

    def save(self, epoch, file_path="output/bert_trained.model"):
        """
        Saving the current BERT model on file_path

        :param epoch: current epoch number
        :param file_path: model output path which gonna be file_path+"ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
Example #5
def set_lr(optimizer, lr):
    for group in optimizer.param_groups:
        group['lr'] = lr


total_len = len(train_loader)

while True:
    decay_factor = 0.9**((epoch) // lr_decay_rate)
    current_lr = max(lr * decay_factor, 1e-4)
    set_lr(optimizer, current_lr)  # set the decayed rate
    for i, data in enumerate(train_loader, start=1):
        optimizer.zero_grad()
        batchsize = data["mask_input"].size(0)
        data = {key: value.cuda() for key, value in data.items()}
        bb1, frame1, hid1 = model.forward(
            data["mask_input"][:, :max_frames, :])
        bb2, frame2, hid2 = model.forward(data["mask_input"][:,
                                                             max_frames:, :])
        bb1 = torch.mean(bb1, 1)
        bb2 = torch.mean(bb2, 1)
        sim = bb1.mul(bb2)
        sim = torch.sum(sim, 1) / nbits

        nei_loss = torch.sum(
            (1 * data["is_similar"].float() - sim)**2) / batchsize
        mask_loss = (torch.sum((frame1-data["visual_word"][:,:max_frames,:])**2)\
                  +torch.sum((frame2-data["visual_word"][:,max_frames:,:])**2))\
                  /(2*max_frames*feature_size*batchsize)

        mu_loss = (torch.sum((torch.mean(hid1,1)-data['n1'])**2)\
            +torch.sum((torch.mean(hid2,1)-data['n2'])**2))/(hidden_size*batchsize)
Example #6
class BERTTrainer:
    def __init__(self, bert: BERT, vocab_size, train_dataloader, test_dataloader=None):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.bert = bert
        self.model = BERTLM(bert, vocab_size).to(self.device)

        if torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        self.train_data = train_dataloader
        self.test_data = test_dataloader

        self.optim = Adam(self.model.parameters(), lr=1e-4, betas=(0.9, 0.999), weight_decay=0.01)
        self.criterion = nn.NLLLoss(ignore_index=0)

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        str_code = "train" if train else "test"
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_iter:
            data = {key: value.to(self.device) for key, value in data.items()}

            next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])
            next_loss = self.criterion(next_sent_output, data["is_next"])
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
            loss = next_loss + mask_loss

            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()

            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            if train:
                self.optim.zero_grad()
                loss.backward()
                self.optim.step()

            post_fix = {
                "epoch": epoch,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100
            }
            data_iter.set_postfix(post_fix)

        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)

    def save(self, output_dir, epoch, file_name="bert_trained_ep%d.model"):
        if isinstance(self.model, nn.DataParallel):
            model = self.model.module
        else:
            model = self.model

        with open(os.path.join(output_dir, file_name % epoch), "wb") as f:
            torch.save(model.cpu(), f)

        model.to(self.device)
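Because this variant serializes the whole module object rather than a state dict, a saved checkpoint can be reloaded directly with torch.load. A brief hedged sketch (bert, vocab_size and the dataloaders are assumed to exist):

# Hypothetical usage of the minimal trainer above; save() calls
# torch.save(model.cpu(), f), so torch.load restores the full module.
trainer = BERTTrainer(bert, vocab_size,
                      train_dataloader=train_loader,
                      test_dataloader=test_loader)
for epoch in range(10):
    trainer.train(epoch)
    trainer.test(epoch)
    trainer.save("output", epoch)      # -> output/bert_trained_ep{epoch}.model

# reload later (the BERTLM class must be importable for unpickling):
# model = torch.load("output/bert_trained_ep9.model")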
Example #7
class BERTTrainer(BasicTrainer):
    def __init__(self, bert: BERT, vocab_size: int, epochs: int,
                 tensorboard_log_dir: str, output_path: str,
                 train_dataloader: DataLoader,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
                 with_cuda: bool = True, log_freq: int = 10, save_steps: int = -1):

        super(BERTTrainer, self).__init__(bert=bert, epochs=epochs,
                                               tensorboard_log_dir=tensorboard_log_dir,
                                               output_path=output_path,
                                               train_dataloader=train_dataloader,
                                               with_cuda=with_cuda, log_freq=log_freq, save_steps=save_steps)

        self.model = BERTLM(bert, vocab_size).to(self.device)

        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def iteration(self, epoch, data_loader):
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP:%d" % epoch,
                              total=self.n_batches,
                              bar_format="{l_bar}{r_bar}",
                              disable=False)

        avg_loss = 0.0
        for i, data in data_iter:
            global_step = epoch * self.n_batches + i + 1

            data = {key: value.to(self.device) for key, value in data.items()}

            mask_lm_output = self.model.forward(data["bert_input"])

            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])

            loss = mask_loss

            self.optim_schedule.zero_grad()
            loss.backward()
            self.optim_schedule.step_and_update_lr()

            avg_loss += loss.item()

            self.tensorborad_writer.add_scalar("Masked_language_model loss", mask_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Average loss in epoch", avg_loss / (i + 1), global_step)

            post_fix = {
                "epoch": epoch,
                "iter": i+1,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }

            if (i+1) % self.log_freq == 0:
                data_iter.write(str(post_fix))

            if self.save_steps > 0 and ((i + 1) % self.save_steps == 0 or (i + 1) == self.n_batches):
                self.save(epoch, i + 1)