class BERTTrainer: """ BERTTrainer make the pretrained BERT model with two LM training method. 1. Masked Language Model : 3.3.1 Task #1: Masked LM please check the details on README.md with simple example. """ def __init__(self, bert: BERT, vocab_size: int, model: BERTLM, train_dataloader: DataLoader, test_dataloader: DataLoader = None, with_cuda: bool = True, cuda_devices=None, log_freq: int = hp.log_freq, args=None, global_step=0, path=None): """ :param bert: MLM model which you want to train :param vocab_size: total word vocab size :param train_dataloader: train dataset data loader :param test_dataloader: test dataset data loader [can be None] :param lr: learning rate of optimizer :param betas: Adam optimizer betas :param weight_decay: Adam optimizer weight decay param :param with_cuda: traning with cuda :param log_freq: logging frequency of the batch iteration """ self.args = args self.step = global_step self.path = path # Setup cuda device for BERT training, argument -c, --cuda should be true cuda_condition = torch.cuda.is_available() and with_cuda self.device = torch.device("cuda:0,1,2,3" if cuda_condition else "cpu") # This BERT model will be saved every epoch self.bert = bert # Initialize the BERT Language Model, with BERT model if model is None: self.model = BERTLM(bert, vocab_size).to(self.device) else: self.model = model #self.model = BERTLM(bert, vocab_size).to(self.device) #self.model = torch.load('./output/model_mlm/mlm_ep2.model') #self.model = BertForSA(bert).to(self.device) # Distributed GPU training if CUDA can detect more than 1 GPU if with_cuda and torch.cuda.device_count() > 1: print("Using %d GPUS for BERT" % torch.cuda.device_count()) self.model = nn.DataParallel(self.model, device_ids=cuda_devices) # Setting the train and test data loader self.train_data = train_dataloader self.test_data = test_dataloader # Setting the Adam optimizer with hyper-param total_steps = hp.epochs * len(self.train_data) self.optimer = optim4GPU(self.model, total_steps) # Using Negative Log Likelihood Loss function for predicting the masked_token self.criterion = nn.NLLLoss(ignore_index=0) # Writer self.log_freq = log_freq # train self.train_loss_writer = SummaryWriter( f'{self.path.runs_path}/train/train_loss') self.train_attn_layer_writer = SummaryWriter( f'{self.path.runs_path}/train/attn_layer') self.train_model_param_writer = SummaryWriter( f'{self.path.runs_path}/train/model_param') # valid self.valid_loss_writer = SummaryWriter( f'{self.path.runs_path}/valid/valid_loss') self.valid_attn_layer_writer = SummaryWriter( f'{self.path.runs_path}/valid/valid_attn_layer') self.num_params() def train(self): train_writer = (self.train_loss_writer, self.train_attn_layer_writer, self.train_model_param_writer) valid_writer = (self.valid_loss_writer, self.valid_attn_layer_writer) try: for epoch in range(hp.epochs): # Setting the tqdm progress bar data_iter = tqdm.tqdm(enumerate(self.train_data), desc="EP_%s:%d" % ("train", epoch), total=len(self.train_data), bar_format="{l_bar}{r_bar}") running_loss = 0 for i, data in data_iter: self.step += 1 # 0. batch_data will be sent into the device(GPU or cpu) data = { key: value.to(self.device) for key, value in data.items() } # 1. forward masked_lm model mask_lm_output, attn_list = self.model.forward( data["mlm_input"], data["input_position"]) # 2. NLLLoss of predicting masked token word self.optimer.zero_grad() loss = self.criterion(mask_lm_output.transpose(1, 2), data["mlm_label"]) # 3. 
backward and optimization only in train loss.backward() self.optimer.step() # loss running_loss += loss.item() avg_loss = running_loss / (i + 1) # write log post_fix = { "epoch": epoch, "iter": i, "step": self.step, "avg_loss": avg_loss, "loss": loss.item() } if i % self.log_freq == 0: data_iter.write(str(post_fix)) # writer train loss if self.step % hp.save_train_loss == 0: self.train_loss_writer.add_scalar( 'train_loss', loss, self.step) # writer if self.step % hp.save_runs == 0 and data["mlm_input"].size( 0) == hp.batch_size: # 不足batch数量则不采样 # writer attns_layer for layer, prob in enumerate(attn_list): prob = prob[0] fig, axs = plt.subplots(1, 4, figsize=(20, 10)) print("Layer", layer + 1) for h in range(hp.attn_heads): # a = self.model.bert.layers[layer].multihead.attention[0][h].data self.draw(prob[h].cpu().detach().numpy(), [], [], ax=axs[h]) plt.savefig( f"{self.path.plt_train_attn_path}/Epoch{epoch}_train_step{self.step}_layer{layer+1}" ) # plt.show() # tensorboardX write for i, prob in enumerate(attn_list): # 第i层,每层画四个图 prob = prob[0] for j in range(hp.attn_heads): # 1,2,3,4 第j个 # print(f"j * self.args.batch_size - 1:{j * self.args.batch_size - 1}") x = vutils.make_grid( prob[j] * 255) # eg:如果是512,94,94 则取127,255,383,511 self.train_attn_layer_writer.add_image( f'Epoch{epoch}_train_attn_layer{i}_head{j + 1}', x, self.step) for module in self.model.modules( ): # param.clone().cpu().data.numpy() .module for name, param in module.named_parameters(): self.train_model_param_writer.add_histogram( f"Epoch{epoch}_train_{name}", param.clone().cpu().data.numpy(), self.step) # write model_param todo # for name, param in self.model.module.named_parameters(): # param.clone().cpu().data.numpy() .module # self.train_model_param_writer.add_histogram(f"Epoch{epoch}_train_{name}", param.clone().cpu().data.numpy(), self.step) # save model checkpoint if self.step % hp.save_checkpoint == 0: self.bert.checkpoint(self.path.bert_checkpoints_path, self.step) # save bert model if self.step % hp.save_model == 0: self.save_model(epoch, f"{self.path.bert_path}/bert") self.save_mlm_model(epoch, f"{self.path.mlm_path}/mlm") # evaluate if self.step % hp.save_valid_loss == 0: valid_loss = self.evaluate(epoch, valid_writer) valid_loss = self.evaluate(epoch, valid_writer) print( f"EP_{epoch}, train_avg_loss={avg_loss}, valid_avg_loss={valid_loss}" ) for writer in train_writer: writer.close() for writer in valid_writer: writer.close() except BaseException: traceback.print_exc() for writer in train_writer: writer.close() for writer in valid_writer: writer.close() def evaluate(self, epoch, valid_writer): (self.valid_loss_writer, self.valid_attn_layer_writer) = valid_writer self.model.eval() # Setting the tqdm progress bar data_iter = tqdm.tqdm(enumerate(self.test_data), desc="EP_%s:%d" % ("test", epoch), total=len(self.test_data), bar_format="{l_bar}{r_bar}") running_loss = 0 with torch.no_grad(): for i, data in data_iter: self.step += 1 # 0. batch_data will be sent into the device(GPU or cpu) data = { key: value.to(self.device) for key, value in data.items() } # 1. forward masked_lm model mask_lm_output, attn_list = self.model.forward( data["mlm_input"], data["input_position"]) # 2. 
NLLLoss of predicting masked token word loss = self.criterion(mask_lm_output.transpose(1, 2), data["mlm_label"]) # loss running_loss += loss.cpu().detach().numpy() avg_loss = running_loss / (i + 1) # print log post_fix = { "epoch": epoch, "iter": i, "step": self.step, "avg_loss": avg_loss, "loss": loss.item() } if i % self.log_freq == 0: data_iter.write(str(post_fix)) # writer valid loss self.valid_loss_writer.add_scalar('valid_loss', loss, self.step) if self.step % hp.save_runs == 0: # writer attns_layer for layer, prob in enumerate(attn_list): prob = prob[0] fig, axs = plt.subplots(1, 4, figsize=(20, 10)) print("Layer", layer + 1) for h in range(hp.attn_heads): # a = self.model.bert.layers[layer].multihead.attention[0][h].data self.draw(prob[h].cpu().detach().numpy(), [], [], ax=axs[h]) plt.savefig( f"{self.path.plt_train_attn_path}/Epoch{epoch}_valid_step{self.step}_layer{layer + 1}" ) # plt.show() # tensorboardX write for i, prob in enumerate(attn_list): # 第i层,每层画四个图 prob = prob[0] for j in range(hp.attn_heads): # 1,2,3,4 第j个 # print(f"j * self.args.batch_size - 1:{j * self.args.batch_size - 1}") x = vutils.make_grid( prob[j] * 255) # eg:如果是512,94,94 则取127,255,383,511 self.train_attn_layer_writer.add_image( f'Epoch{epoch}_valid_attn_layer{i}_head{j + 1}', x, self.step) print(f"Valid Over!") return avg_loss def evaluate_and_print(self, vocab): self.model.eval() # Setting the tqdm progress bar data_iter = tqdm.tqdm(enumerate(self.test_data), total=len(self.test_data), bar_format="{l_bar}{r_bar}") running_loss = 0 with torch.no_grad(): for i, data in data_iter: self.step += 1 # 0. batch_data will be sent into the device(GPU or cpu) data = { key: value.to(self.device) for key, value in data.items() } # 1. forward masked_lm model mask_lm_output, attn_list = self.model.forward( data["mlm_input"], data["input_position"]) for j, q in enumerate(mask_lm_output): #输出MLM input层 input = data["mlm_input"][j] char = '' for i, z in enumerate(input): if z.item() == 4: char += '[' + vocab.index2char( q[i].argmax().item()) + ']' else: char += vocab.index2char(z.item()) print(char) #label label = data["mlm_label"][j] char = '' for i, z in enumerate(label): char += vocab.index2char(z.item()) if z.item() != 0: _, topk = torch.topk(q[i], 10) softmax = nn.Softmax(0)(q[i]) info = vocab.index2char(z.item()) + ' ' + (str)( softmax[z.item()]) + ' ' for zz in topk: info += ' ' + vocab.index2char( zz.item()) + ' ' + (str)( softmax[zz.item()]) print(info) print(char) # 2. 
NLLLoss of predicting masked token word loss = self.criterion(mask_lm_output.transpose(1, 2), data["mlm_label"]) # loss running_loss += loss.cpu().detach().numpy() avg_loss = running_loss / (i + 1) # print log post_fix = { "iter": i, "step": self.step, "avg_loss": avg_loss, "loss": loss.item() } if i % self.log_freq == 0: data_iter.write(str(post_fix)) # writer valid loss self.valid_loss_writer.add_scalar('valid_loss', loss, self.step) if self.step % hp.save_runs == 0: # writer attns_layer for layer, prob in enumerate(attn_list): prob = prob[0] fig, axs = plt.subplots(1, 4, figsize=(20, 10)) print("Layer", layer + 1) for h in range(hp.attn_heads): # a = self.model.bert.layers[layer].multihead.attention[0][h].data self.draw(prob[h].cpu().detach().numpy(), [], [], ax=axs[h]) plt.savefig( f"{self.path.plt_train_attn_path}/Epoch{epoch}_valid_step{self.step}_layer{layer + 1}" ) # plt.show() print(f"Valid Over!") return avg_loss def eval(self): self.model.eval() data_iter = tqdm.tqdm(enumerate(self.test_data), total=len(self.test_data), bar_format="{l_bar}{r_bar}") results = [] with torch.no_grad(): for i, data in data_iter: self.step += 1 data = { key: value.to(self.device) for key, value in data.items() } logits = self.model.forward(data["mlm_input"], data["input_position"]) accuracy, result = self.calculate(logits, data["mlm_label"]) results.append(result) data_iter.set_description('Iter(acc=%5.3f)' % accuracy) total_accuracy = torch.cat(results).mean().item() print('Accuracy:', total_accuracy) def calculate(self, logits, label_id): _, label_pred = logits.max(1) result = (label_pred == label_id).float() # .cpu().numpy() accuracy = result.mean() return accuracy, result def stream(self, message): sys.stdout.write(f"\r{message}") def draw(self, data, x, y, ax): seaborn.heatmap( data, xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0, # 取值0-1 cbar=False, ax=ax) def num_params(self, print_out=True): params_requires_grad = filter(lambda p: p.requires_grad, self.model.parameters()) params_requires_grad = sum( [np.prod(p.size()) for p in params_requires_grad]) #/ 1_000_000 parameters = sum([np.prod(p.size()) for p in self.model.parameters()]) #/ 1_000_000 if print_out: print('Trainable total Parameters: %d' % parameters) print('Trainable requires_grad Parameters: %d' % params_requires_grad) def save_model(self, epoch, file_path="output/bert_trained.model"): """ Saving the current BERT model on file_path :param epoch: current epoch number :param file_path: model output path which gonna be file_path+"ep%d" % epoch :return: final_output_path """ output_path = file_path + "_ep%d.model" % epoch torch.save(self.bert.cpu(), output_path) self.bert.to(self.device) print("EP:%d Model Saved on:" % epoch, output_path) return output_path def save_mlm_model(self, epoch, file_path="output/mlm_trained.model"): """ Saving the current MLM model on file_path :param epoch: current epoch number :param file_path: model output path which gonna be file_path+"ep%d" % epoch :return: final_output_path """ output_path = file_path + "_ep%d.model" % epoch torch.save(self.model.cpu(), output_path) self.model.to(self.device) print("EP:%d Model Saved on:" % epoch, output_path) return output_path
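# ---------------------------------------------------------------------------
# Hedged sketch (not part of the trainer above): why the loss call is
# criterion(mask_lm_output.transpose(1, 2), labels). nn.NLLLoss over a token
# sequence expects log-probabilities shaped (batch, vocab, seq_len) and
# targets shaped (batch, seq_len), while the model emits
# (batch, seq_len, vocab); ignore_index=0 skips positions labelled 0
# (padding / unmasked tokens). The shapes below are illustrative only.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

batch, seq_len, vocab = 2, 8, 32
log_probs = torch.log_softmax(torch.randn(batch, seq_len, vocab), dim=-1)
labels = torch.randint(1, vocab, (batch, seq_len))
labels[:, 4:] = 0  # positions labelled 0 are ignored by the loss

criterion = nn.NLLLoss(ignore_index=0)
loss = criterion(log_probs.transpose(1, 2), labels)  # (N, C, d) vs (N, d)
print(loss.item())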
class BERTTrainer: """ BERTTrainer make the pretrained BERT model with two LM training method. 1. Masked Language Model : 3.3.1 Task #1: Masked LM 2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction please check the details on README.md with simple example. """ def __init__(self, bert: BERT, vocab_size: int, train_dataloader: DataLoader, test_dataloader: DataLoader = None, lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000, with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, pad_index=0): """ :param bert: BERT model which you want to train :param vocab_size: total word vocab size :param train_dataloader: train dataset data loader :param test_dataloader: test dataset data loader [can be None] :param lr: learning rate of optimizer :param betas: Adam optimizer betas :param weight_decay: Adam optimizer weight decay param :param with_cuda: traning with cuda :param log_freq: logging frequency of the batch iteration """ # Setup cuda device for BERT training, argument -c, --cuda should be true cuda_condition = torch.cuda.is_available() and with_cuda self.device = torch.device("cuda:0" if cuda_condition else "cpu") # This BERT model will be saved every epoch self.bert = bert # Initialize the BERT Language Model, with BERT model self.model = BERTLM(bert, vocab_size).to(self.device) # Distributed GPU training if CUDA can detect more than 1 GPU if with_cuda and torch.cuda.device_count() > 1: print("Using %d GPUS for BERT" % torch.cuda.device_count()) self.model = nn.DataParallel(self.model, device_ids=cuda_devices) # Setting the train and test data loader self.train_data = train_dataloader self.test_data = test_dataloader self.pad_index = pad_index # Setting the Adam optimizer with hyper-param # self.optim = Adam(self.model.parameters(), lr=lr, # betas=betas, weight_decay=weight_decay) # self.optim_schedule = ScheduledOptim( # self.optim, self.bert.hidden, n_warmup_steps=warmup_steps) self.optim = SGD(self.model.parameters(), lr=lr, momentum=0.9) # Using Negative Log Likelihood Loss function for predicting the masked_token self.criterion = nn.NLLLoss(ignore_index=self.pad_index) self.log_freq = log_freq print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()])) def train(self, epoch): self.model.train() return self.iteration(epoch, self.train_data) def test(self, epoch): self.model.eval() return self.iteration(epoch, self.test_data, train=False) def iteration(self, epoch, data_loader, train=True): """ loop over the data_loader for training or testing if on train status, backward operation is activated and also auto save the model every peoch :param epoch: current epoch index :param data_loader: torch.utils.data.DataLoader for iteration :param train: boolean value of is train or test :return: None """ # pdb.set_trace() str_code = "train" if train else "test" # Setting the tqdm progress bar data_iter = tqdm.tqdm(enumerate(data_loader), desc="EP_%s:%d" % (str_code, epoch), total=len(data_loader), bar_format="{l_bar}{r_bar}") avg_loss = 0.0 total_correct = 0 total_element = 0 def calculate_iter(data): next_sent_output, mask_lm_output = self.model.forward( data["bert_input"], data["segment_label"], data["adj_mat"], train) mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"]) loss = mask_loss return loss for i, data in data_iter: # 0. 
batch_data will be sent into the device(GPU or cpu) # pdb.set_trace() data = data[0] data = {key: value.to(self.device) for key, value in data.items()} if train: loss = calculate_iter(data) else: with torch.no_grad(): loss = calculate_iter(data) # 1. forward the next_sentence_prediction and masked_lm model # next_sent_output, mask_lm_output = self.model.forward( # data["bert_input"], data["segment_label"], data["adj_mat"], train) # # pdb.set_trace() # # 2-1. NLL(negative log likelihood) loss of is_next classification result # # next_loss = self.criterion(next_sent_output, data["is_next"]) # # 2-2. NLLLoss of predicting masked token word # mask_loss = self.criterion( # mask_lm_output.transpose(1, 2), data["bert_label"]) # # pdb.set_trace() # # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure # # loss = next_loss + mask_loss # loss = mask_loss # 3. backward and optimization only in train if train: self.optim.zero_grad() loss.backward() # self.optim.step_and_update_lr() self.optim.step() # pdb.set_trace() # mlm prediction accuracy # correct = next_sent_output.argmax( # dim=-1).eq(data["is_next"]).sum().item() correct = 0 elements = 0 for labels, t_labels in zip(mask_lm_output.argmax(dim=-1), data["bert_label"]): correct += sum([ 1 if l == t and t != self.pad_index else 0 for l, t in zip(labels, t_labels) ]) elements += sum([1 for t in t_labels if t != self.pad_index]) # next sentence prediction accuracy # correct = next_sent_output.argmax( # dim=-1).eq(data["is_next"]).sum().item() avg_loss += loss.item() total_correct += correct # total_element += data["is_next"].nelement() total_element += elements post_fix = { "epoch": epoch, "iter": i, "avg_loss": avg_loss / (i + 1), "avg_acc": total_correct / total_element * 100, "loss": loss.item() } if i % self.log_freq == 0 and i != 0: data_iter.write(str(post_fix)) print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=", total_correct * 100.0 / total_element) return avg_loss / len(data_iter) def save(self, epoch, file_path="output/bert_trained.model"): """ Saving the current BERT model on file_path :param epoch: current epoch number :param file_path: model output path which gonna be file_path+"ep%d" % epoch :return: final_output_path """ # output_path = file_path + ".ep%d" % epoch # torch.save(self.bert.cpu(), output_path) # self.bert.to(self.device) # print("EP:%d Model Saved on:" % epoch, output_path) # return output_path output_path = file_path # + ".ep%d" % epoch # if self.updated: # return output_path # torch.save(self.bert.cpu(), output_path) torch.save( { 'epoch': epoch, 'model_state_dict': self.model.state_dict() # 'optimizer_state_dict': optimizer.state_dict(), # 'loss': loss, # ... }, output_path) # self.bert.to(self.device) print("EP:%d Model Saved on:" % epoch, output_path) # self.updated = True return output_path
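# ---------------------------------------------------------------------------
# Hedged sketch: a vectorized equivalent of the per-token accuracy loop in
# iteration() above, counting correct masked-token predictions while skipping
# pad positions. The shapes and pad index are illustrative; the real batch
# comes from the project's dataloader.
# ---------------------------------------------------------------------------
import torch

pad_index = 0
logits = torch.randn(2, 8, 32)         # (batch, seq_len, vocab)
labels = torch.randint(0, 32, (2, 8))  # pad_index marks ignored positions

pred = logits.argmax(dim=-1)
mask = labels.ne(pad_index)
correct = (pred.eq(labels) & mask).sum().item()
elements = mask.sum().item()
print(correct, elements, correct / max(elements, 1))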
class BERTTrainer: """ BERTTrainer make the pretrained BERT model with two LM training method. 1. Masked Language Model : 3.3.1 Task #1: Masked LM 2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction please check the details on README.md with simple example. """ def __init__(self, bert: BERT, vocab_size: int, train_dataloader: DataLoader, test_dataloader: DataLoader = None, lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000, with_cuda: bool = True, cuda_devices=None, log_freq: int = 10): """ :param bert: BERT model which you want to train :param vocab_size: total word vocab size :param train_dataloader: train dataset data loader :param test_dataloader: test dataset data loader [can be None] :param lr: learning rate of optimizer :param betas: Adam optimizer betas :param weight_decay: Adam optimizer weight decay param :param with_cuda: traning with cuda :param log_freq: logging frequency of the batch iteration """ # Setup cuda device for BERT training, argument -c, --cuda should be true cuda_condition = torch.cuda.is_available() and with_cuda self.device = torch.device("cuda:0" if cuda_condition else "cpu") # This BERT model will be saved every epoch self.bert = bert # Initialize the BERT Language Model, with BERT model self.model = BERTLM(bert, vocab_size).to(self.device) # Distributed GPU training if CUDA can detect more than 1 GPU if with_cuda and torch.cuda.device_count() > 1: print("Using %d GPUS for BERT" % torch.cuda.device_count()) self.model = nn.DataParallel(self.model, device_ids=cuda_devices) # Setting the train and test data loader self.train_data = train_dataloader self.test_data = test_dataloader # Setting the Adam optimizer with hyper-param self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay) self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps) # Using Negative Log Likelihood Loss function for predicting the masked_token self.criterion = nn.NLLLoss(ignore_index=0) self.log_freq = log_freq print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()])) def train(self, epoch): self.iteration(epoch, self.train_data) def test(self, epoch): self.iteration(epoch, self.test_data, train=False) def iteration(self, epoch, data_loader, train=True): """ loop over the data_loader for training or testing if on train status, backward operation is activated and also auto save the model every peoch :param epoch: current epoch index :param data_loader: torch.utils.data.DataLoader for iteration :param train: boolean value of is train or test :return: None """ str_code = "train" if train else "test" # Setting the tqdm progress bar data_iter = tqdm.tqdm(enumerate(data_loader), desc="EP_%s:%d" % (str_code, epoch), total=len(data_loader), bar_format="{l_bar}{r_bar}") avg_loss = 0.0 total_correct = 0 total_close = 0 total_element = 0 for i, data in data_iter: # 0. batch_data will be sent into the device(GPU or cpu) data = {key: value.to(self.device) for key, value in data.items()} # 1. forward the next_sentence_prediction and masked_lm model next_sent_output, mask_lm_output = self.model.forward( data["bert_input"], data["segment_label"]) # 2-1. NLL(negative log likelihood) loss of is_next classification result # next_loss = self.criterion(next_sent_output, data["is_next"]) # 2-2. NLLLoss of predicting masked token word mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"]) # 2-3. 
Adding next_loss and mask_loss : 3.4 Pre-training Procedure loss = mask_loss # print(data) # input() # 3. backward and optimization only in train if train: self.optim_schedule.zero_grad() loss.backward() self.optim_schedule.step_and_update_lr() # next sentence prediction accuracy label_mask = torch.where( data["bert_label"] > 0, torch.ones(*data["bert_label"].shape).to(self.device), torch.zeros(*data["bert_label"].shape).to(self.device)) correct = torch.mul( mask_lm_output.argmax(dim=2).eq(data["t1_raw"]), label_mask).sum().item() close = torch.mul( torch.mul( mask_lm_output.argmax(dim=2).ge(data["t1_raw"] - 10), mask_lm_output.argmax(dim=2).le(data["t1_raw"] + 10)), label_mask).sum().item() avg_loss += loss.item() total_correct += correct total_close += close total_element += label_mask.sum().item() # print(data['bert_label'], mask_lm_output.argmax(dim=2), correct) # input() post_fix = { "epoch": epoch, "iter": i, "avg_loss": avg_loss / (i + 1), "avg_acc": total_correct / total_element, "avg_clo": total_close / total_element, "loss": loss.item() } if i % self.log_freq == 0: data_iter.write(str(post_fix)) print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=", total_correct * 100.0 / total_element, "total_clo=", total_close * 100.0 / total_element) def save(self, epoch, file_path="output/bert_trained.model"): """ Saving the current BERT model on file_path :param epoch: current epoch number :param file_path: model output path which gonna be file_path+"ep%d" % epoch :return: final_output_path """ output_path = file_path + ".ep%d" % epoch torch.save(self.bert.cpu(), output_path) self.bert.to(self.device) print("EP:%d Model Saved on:" % epoch, output_path) return output_path
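# ---------------------------------------------------------------------------
# Hedged sketch: the exact / "close" accuracy used in iteration() above, on
# toy tensors. A prediction counts as "close" when it lands within +/-10 of
# the raw target id, restricted to positions where bert_label > 0. Shapes are
# illustrative only.
# ---------------------------------------------------------------------------
import torch

mask_lm_output = torch.randn(2, 8, 100)   # (batch, seq_len, vocab)
bert_label = torch.randint(0, 2, (2, 8))  # > 0 marks a labelled position
t1_raw = torch.randint(0, 100, (2, 8))    # raw target ids

pred = mask_lm_output.argmax(dim=2)
label_mask = (bert_label > 0).float()
correct = torch.mul(pred.eq(t1_raw), label_mask).sum().item()
close = torch.mul((pred - t1_raw).abs().le(10), label_mask).sum().item()
print(correct, close, label_mask.sum().item())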
class BERTTrainer: """ BERTTrainer make the pretrained BERT model with two LM training method. 1. Masked Language Model : 3.3.1 Task #1: Masked LM 2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction please check the details on README.md with simple example. """ def __init__(self, bert: BERT, vocab_size: int, seq_len: int, train_dataloader: DataLoader, lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000, with_cuda: bool = True, cuda_devices=None, log_freq: int = 100): """ :param bert: BERT model which you want to train :param vocab_size: total word vocab size :param train_dataloader: train dataset data loader :param lr: learning rate of optimizer :param betas: Adam optimizer betas :param weight_decay: Adam optimizer weight decay param :param with_cuda: traning with cuda :param log_freq: logging frequency of the batch iteration """ self.seq_len = seq_len # Setup cuda device for BERT training, argument -c, --cuda should be true cuda_condition = torch.cuda.is_available() and with_cuda self.device = torch.device("cuda:0" if cuda_condition else "cpu") # This BERT model will be saved every epoch self.bert = bert # Initialize the BERT Language Model, with BERT model self.model = BERTLM(bert, vocab_size).to(self.device) # Distributed GPU training if CUDA can detect more than 1 GPU if with_cuda and torch.cuda.device_count() > 1: print("Using %d GPUS for BERT" % torch.cuda.device_count()) self.model = nn.DataParallel(self.model, device_ids=cuda_devices) # Setting the train data loader self.train_data = train_dataloader # Setting the Adam optimizer with hyper-param self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay) self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps) # Using Negative Log Likelihood Loss function for predicting the masked_token self.criterion = nn.MSELoss() self.log_freq = log_freq print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()])) def train(self, epoch): self.iteration(epoch, self.train_data) def test(self, epoch): self.iteration(epoch, self.train_data, train=False) def iteration(self, epoch, data_loader, train=True): """ loop over the data_loader for training or testing if on train status, backward operation is activated and also auto save the model every peoch :param epoch: current epoch index :param data_loader: torch.utils.data.DataLoader for iteration :param train: boolean value of is train or test :return: None """ str_code = "train" if train else "test" # Setting the tqdm progress bar data_iter = tqdm(enumerate(data_loader), desc="EP_%s:%d" % (str_code, epoch), total=len(data_loader), bar_format="{l_bar}{r_bar}") avg_loss = 0.0 for i, data in data_iter: # 0. batch_data will be sent into the device(GPU or cpu) data = {key: value.to(self.device) for key, value in data.items()} # 1. forward the next_sentence_prediction and masked_lm model bert_output, original_emb = self.model.forward(data["bert_input"]) # 2. MSELoss of predicting masked token word cos = nn.CosineSimilarity(dim=2, eps=1e-6) key = bert_output[:,0,:].unsqueeze(1).repeat(1, self.seq_len, 1) label = data["syn_label"].type(torch.FloatTensor).cuda() loss_1 = (torch.mul(((bert_output-original_emb)**2).mean(dim=2), torch.abs(label)).sum(dim=1)/torch.abs(label).sum(dim=1)).mean() loss_2 = ((torch.sub(target,torch.mul(cos(bert_output, key), label))).sum(dim=1)/torch.abs(label).sum(dim=1)).mean() loss = 0.7 * loss_1 + 0.3 * loss_2 # 3. 
backward and optimization only in train if train: self.optim_schedule.zero_grad() loss.backward() self.optim_schedule.step_and_update_lr() else: with open('../output/embeddings/raw/result_input_iter{}.pkl'.format(i), "wb") as fb: pickle.dump(data["bert_input"], fb) with open('../output/embeddings/raw/result_output_iter{}.pkl'.format(i), "wb") as fb: pickle.dump(bert_output, fb) # next sentence prediction accuracy avg_loss += loss.item() post_fix = { "epoch": epoch, "iter": i, "avg_loss": avg_loss / (i + 1), "loss": loss.item() } if i % self.log_freq == 0: data_iter.write(str(post_fix)) print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter)) def save(self, epoch, file_path="output/bert_trained.model"): """ Saving the current BERT model on file_path :param epoch: current epoch number :param file_path: model output path which gonna be file_path+"ep%d" % epoch :return: final_output_path """ output_path = file_path + ".ep%d" % epoch torch.save(self.bert.cpu(), output_path) self.bert.to(self.device) print("EP:%d Model Saved on:" % epoch, output_path) return output_path
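# ---------------------------------------------------------------------------
# Hedged sketch: the similarity term in iteration() above compares every
# position's output embedding against the first ([CLS]-like) position,
# repeated along the sequence. Toy shapes; the +/-1 `label` and the all-ones
# `target` mirror the assumption made in the trainer (cos * label -> 1).
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

batch, seq_len, hidden = 2, 6, 16
bert_output = torch.randn(batch, seq_len, hidden)
label = torch.randint(0, 2, (batch, seq_len)).float() * 2 - 1  # +/-1 per position

cos = nn.CosineSimilarity(dim=2, eps=1e-6)
key = bert_output[:, 0, :].unsqueeze(1).repeat(1, seq_len, 1)
target = torch.ones_like(label)  # assumed target for the cosine term
loss_2 = ((target - cos(bert_output, key) * label).sum(dim=1)
          / label.abs().sum(dim=1)).mean()
print(loss_2.item())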
def set_lr(optimizer, lr):
    for group in optimizer.param_groups:
        group['lr'] = lr


total_len = len(train_loader)
while True:
    # step-decay learning rate: multiply by 0.9 every lr_decay_rate epochs,
    # with a floor of 1e-4
    decay_factor = 0.9**(epoch // lr_decay_rate)
    current_lr = max(lr * decay_factor, 1e-4)
    set_lr(optimizer, current_lr)  # set the decayed rate

    for i, data in enumerate(train_loader, start=1):
        optimizer.zero_grad()
        batchsize = data["mask_input"].size(0)
        data = {key: value.cuda() for key, value in data.items()}

        # forward the two halves of the masked input separately
        bb1, frame1, hid1 = model.forward(data["mask_input"][:, :max_frames, :])
        bb2, frame2, hid2 = model.forward(data["mask_input"][:, max_frames:, :])
        bb1 = torch.mean(bb1, 1)
        bb2 = torch.mean(bb2, 1)

        # neighbourhood (similarity) loss between the two code means
        sim = bb1.mul(bb2)
        sim = torch.sum(sim, 1) / nbits
        nei_loss = torch.sum(
            (1 * data["is_similar"].float() - sim)**2) / batchsize

        # frame reconstruction loss
        mask_loss = (torch.sum((frame1 - data["visual_word"][:, :max_frames, :])**2)
                     + torch.sum((frame2 - data["visual_word"][:, max_frames:, :])**2)) \
            / (2 * max_frames * feature_size * batchsize)

        # hidden-mean regression loss
        mu_loss = (torch.sum((torch.mean(hid1, 1) - data['n1'])**2)
                   + torch.sum((torch.mean(hid2, 1) - data['n2'])**2)) \
            / (hidden_size * batchsize)
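# ---------------------------------------------------------------------------
# Hedged sketch: the step-decay schedule used above, isolated so its shape can
# be sanity-checked. Every lr_decay_rate epochs the learning rate is scaled by
# 0.9, with a floor of 1e-4. The base rate and decay interval are illustrative.
# ---------------------------------------------------------------------------
base_lr, lr_decay_rate = 1e-2, 5
for epoch in range(20):
    decay_factor = 0.9 ** (epoch // lr_decay_rate)
    current_lr = max(base_lr * decay_factor, 1e-4)
    print(epoch, current_lr)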
import os

import torch
import torch.nn as nn
import tqdm
from torch.optim import Adam

# Project-local names (BERT, BERTLM) are assumed to be importable from the
# surrounding package.


class BERTTrainer:
    def __init__(self, bert: BERT, vocab_size, train_dataloader,
                 test_dataloader=None):
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.bert = bert
        self.model = BERTLM(bert, vocab_size).to(self.device)

        if torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        self.train_data = train_dataloader
        self.test_data = test_dataloader

        self.optim = Adam(self.model.parameters(), lr=1e-4,
                          betas=(0.9, 0.999), weight_decay=0.01)
        self.criterion = nn.NLLLoss(ignore_index=0)

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        str_code = "train" if train else "test"
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}")

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_iter:
            data = {key: value.to(self.device) for key, value in data.items()}

            # forward the next-sentence-prediction and masked-LM heads
            next_sent_output, mask_lm_output = self.model.forward(
                data["bert_input"], data["segment_label"])

            # joint loss: NSP + MLM
            next_loss = self.criterion(next_sent_output, data["is_next"])
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2),
                                       data["bert_label"])
            loss = next_loss + mask_loss

            # next-sentence prediction accuracy
            correct = next_sent_output.argmax(
                dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            if train:
                self.optim.zero_grad()
                loss.backward()
                self.optim.step()

            post_fix = {
                "epoch": epoch,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100
            }
            data_iter.set_postfix(post_fix)

        print("EP%d_%s, avg_loss=" % (epoch, str_code),
              avg_loss / len(data_iter), "total_acc=",
              total_correct * 100.0 / total_element)

    def save(self, output_dir, epoch, file_name="bert_trained_ep%d.model"):
        # unwrap nn.DataParallel before saving so the checkpoint loads without it
        if isinstance(self.model, nn.DataParallel):
            model = self.model.module
        else:
            model = self.model
        with open(os.path.join(output_dir, file_name % epoch), "wb") as f:
            torch.save(model.cpu(), f)
        model.to(self.device)
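# ---------------------------------------------------------------------------
# Hedged sketch: the joint objective above is the sum of an NSP loss over
# (batch, 2) log-probabilities and an MLM loss over per-token
# log-probabilities, both through the same nn.NLLLoss. Toy shapes and labels
# only; note that with a single shared criterion, ignore_index=0 also drops
# is_next == 0 examples from the NSP term, as in the trainer above.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

criterion = nn.NLLLoss(ignore_index=0)
next_sent_output = torch.log_softmax(torch.randn(4, 2), dim=-1)    # (batch, 2)
mask_lm_output = torch.log_softmax(torch.randn(4, 8, 32), dim=-1)  # (batch, seq, vocab)
is_next = torch.tensor([1, 0, 1, 1])
bert_label = torch.randint(1, 32, (4, 8))

next_loss = criterion(next_sent_output, is_next)
mask_loss = criterion(mask_lm_output.transpose(1, 2), bert_label)
loss = next_loss + mask_loss
print(loss.item())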
import tqdm
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader

# Project-local names (BERT, BERTLM, BasicTrainer, ScheduledOptim) are assumed
# to be importable from the surrounding package; the loss criterion and the
# tensorboard writer are expected to be set up in BasicTrainer.


class BERTTrainer(BasicTrainer):
    def __init__(self, bert: BERT, vocab_size: int, epochs: int,
                 tensorboard_log_dir: str, output_path: str,
                 train_dataloader: DataLoader, lr: float = 1e-4,
                 betas=(0.9, 0.999), weight_decay: float = 0.01,
                 warmup_steps=10000, with_cuda: bool = True,
                 log_freq: int = 10, save_steps: int = -1):
        super(BERTTrainer, self).__init__(
            bert=bert, epochs=epochs,
            tensorboard_log_dir=tensorboard_log_dir,
            output_path=output_path,
            train_dataloader=train_dataloader,
            with_cuda=with_cuda, log_freq=log_freq, save_steps=save_steps)

        self.model = BERTLM(bert, vocab_size).to(self.device)
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            self.model = nn.DataParallel(self.model)

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas,
                          weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden,
                                             n_warmup_steps=warmup_steps)

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def iteration(self, epoch, data_loader):
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP:%d" % epoch,
                              total=self.n_batches,
                              bar_format="{l_bar}{r_bar}",
                              disable=False)

        avg_loss = 0.0
        for i, data in data_iter:
            global_step = epoch * self.n_batches + i + 1
            data = {key: value.to(self.device) for key, value in data.items()}

            # masked-LM forward pass and loss
            mask_lm_output = self.model.forward(data["bert_input"])
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2),
                                       data["bert_label"])
            loss = mask_loss

            self.optim_schedule.zero_grad()
            loss.backward()
            self.optim_schedule.step_and_update_lr()

            avg_loss += loss.item()
            self.tensorborad_writer.add_scalar("Masked_language_model loss",
                                               mask_loss.item(), global_step)
            self.tensorborad_writer.add_scalar("Average loss in epoch",
                                               avg_loss / (i + 1), global_step)

            post_fix = {
                "epoch": epoch,
                "iter": i + 1,
                "avg_loss": avg_loss / (i + 1),
                "loss": loss.item()
            }
            if (i + 1) % self.log_freq == 0:
                data_iter.write(str(post_fix))

            if self.save_steps > 0 and ((i + 1) % self.save_steps == 0 or
                                        (i + 1) == self.n_batches):
                self.save(epoch, i + 1)
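# ---------------------------------------------------------------------------
# Hedged sketch: ScheduledOptim is project code, but schedulers of this kind
# are usually modelled on the "Attention Is All You Need" warmup rule,
# lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5). The printout below
# only illustrates the warmup-then-decay shape; the project's actual
# implementation may differ.
# ---------------------------------------------------------------------------
def noam_lr(step, d_model=768, warmup=10000):
    step = max(step, 1)
    return (d_model ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)


for s in (1, 100, 1000, 10000, 100000):
    print(s, noam_lr(s))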