def fit(model, training_iter, eval_iter, num_epoch, pbar, lr_decay_mode, initial_lr, verbose=1):
    model.apply(weights_init)
    if use_cuda:
        model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)
    loss_fn = nn.CrossEntropyLoss()

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []
    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    start = time.time()
    for e in range(num_epoch):
        if e > 0:
            lr_update(optimizer=optimizer, epoch=e, lr_decay_mode=lr_decay_mode)
        model.train()
        for index, (inputs, label, length) in enumerate(training_iter):
            if config.use_mem_track:
                gpu_tracker.track()
            if use_cuda:
                inputs = Variable(inputs.cuda())
                label = Variable(label.squeeze(1).cuda())
                length = Variable(length.cuda())

            y_preds = model(inputs, length)
            train_loss = loss_fn(y_preds, label)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            train_acc, _ = model.evaluate(y_preds, label)
            pbar.show_process(train_acc, train_loss.data, time.time() - start, index)
            if config.use_mem_track:
                gpu_tracker.track()

        if use_cuda:
            torch.cuda.empty_cache()

        model.eval()
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            count = 0
            for eval_inputs, eval_label, eval_length in eval_iter:
                if use_cuda:
                    # Note: the length tensor must be moved to the GPU under the
                    # same name that is passed to the model below.
                    eval_inputs, eval_label, eval_length = eval_inputs.cuda(), eval_label.squeeze(1).cuda(), eval_length.cuda()
                y_preds = model(eval_inputs, eval_length)
                eval_loss += loss_fn(y_preds, eval_label).data
                eval_accur, eval_f1_score = model.evaluate(y_preds, eval_label)
                eval_acc += eval_accur
                eval_f1 += eval_f1_score
                count += 1

        logger.info(
            '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc:%.4f - eval_acc:%.4f - eval_f1:%.4f\n'
            % (e + 1, train_loss.data, eval_loss / count, train_acc,
               eval_acc / count, eval_f1 / count))

        if e % verbose == 0:
            train_losses.append(train_loss.data)
            train_accuracy.append(train_acc)
            eval_losses.append(eval_loss / count)
            eval_accuracy.append(eval_acc / count)

    model.save()
    loss_acc_plot(history)

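# Example invocation of the classification `fit` above. Everything named here
# (model class, iterator builder, ProgressBar signature) is a hypothetical
# stand-in for the repo's real entry point, shown only to make the expected
# argument types concrete:
#
#   model = TextClassifier(vocab_size=config.vocab_size, num_classes=config.num_classes)
#   train_iter, dev_iter = build_iterators(config.data_dir)
#   pbar = ProgressBar(epoch_size=len(train_iter), batch_size=config.batch_size)
#   fit(model, train_iter, dev_iter, num_epoch=10, pbar=pbar,
#       lr_decay_mode=config.lr_decay_mode, initial_lr=1e-3)
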
def fit(model, training_iter, eval_iter, num_epoch, pbar, num_train_steps, verbose=1):
    # ------------------ Select CUDA mode ----------------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()  # multi-GPU
        # n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # --------------------- Optimizer -------------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    t_total = num_train_steps

    ## --------------------- GPU half precision (fp16) -----------------------------
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    ## ------------------------ GPU single precision (fp32) ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    # --------------------- Model initialization ----------------------
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=[0, 1])

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []
    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    # ------------------------ Training ------------------------------
    best_f1 = 0
    start = time.time()
    global_step = 0
    for e in range(num_epoch):
        model.train()
        for step, batch in enumerate(training_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, start_positions, end_positions, answer_types = batch
            start_logits, end_logits, answer_type_logits = model(input_ids, segment_ids, input_mask)
            train_loss = loss_fn(start_logits, end_logits, answer_type_logits,
                                 start_positions, end_positions, answer_types)

            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            start_logits, end_logits = start_logits.cpu(), end_logits.cpu()
            start_positions, end_positions = start_positions.cpu(), end_positions.cpu()
            train_acc, f1 = qa_evaluate((start_logits, end_logits),
                                        (start_positions, end_positions))
            pbar.show_process(train_acc, train_loss.item(), f1, time.time() - start, step)

        # ----------------------- Validation ----------------------------
        model.eval()
        count = 0
        eval_starts_predict, eval_ends_predict = [], []
        eval_starts_label, eval_ends_label = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions, answer_types = batch
                start_logits, end_logits, answer_type_logits = model(input_ids, segment_ids, input_mask)
                eval_los = loss_fn(start_logits, end_logits, answer_type_logits,
                                   start_positions, end_positions, answer_types)
                eval_loss = eval_los + eval_loss
                count += 1
                eval_starts_predict.append(start_logits)
                eval_ends_predict.append(end_logits)
                eval_starts_label.append(start_positions)
                eval_ends_label.append(end_positions)

            eval_starts_predicted = torch.cat(eval_starts_predict, dim=0).cpu()
            eval_ends_predicted = torch.cat(eval_ends_predict, dim=0).cpu()
            eval_starts_labeled = torch.cat(eval_starts_label, dim=0).cpu()
            eval_ends_labeled = torch.cat(eval_ends_label, dim=0).cpu()
            eval_predicted = (eval_starts_predicted, eval_ends_predicted)
            eval_labeled = (eval_starts_labeled, eval_ends_labeled)
            eval_acc, eval_f1 = qa_evaluate(eval_predicted, eval_labeled)

        logger.info(
            '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc:%.4f - eval_acc:%.4f - eval_f1:%.4f\n'
            % (e + 1, train_loss.item(), eval_loss.item() / count, train_acc,
               eval_acc, eval_f1))

        # Save the best model
        if eval_f1 > best_f1:
            best_f1 = eval_f1
            save_model(model, args.output_dir)

        if e % verbose == 0:
            train_losses.append(train_loss.item())
            train_accuracy.append(train_acc)
            eval_losses.append(eval_loss.item() / count)
            eval_accuracy.append(eval_acc)

    loss_acc_plot(history)

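# Both BERT training loops in this file scale the learning rate by
# `warmup_linear`, which is not defined here. The sketch below follows the
# schedule shipped with pytorch-pretrained-bert (linear ramp-up over the warmup
# fraction, then linear decay to zero); treat it as an assumed stand-in for
# whatever helper the repo actually imports:
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed, i.e. global_step / t_total
    if x < warmup:
        return x / warmup  # linear warm-up phase
    return 1.0 - x         # linear decay phase
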
def fit(model, training_iter, eval_iter, num_epoch, pbar, lr_decay_mode, initial_lr, verbose=1):
    model.apply(weights_init)
    if use_cuda:
        model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []
    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    best_f1 = 0
    start = time.time()
    for e in range(num_epoch):
        if e > 0:
            lr_update(optimizer=optimizer, epoch=e, lr_decay_mode=lr_decay_mode)
        model.train()
        for index, (inputs, label, length) in enumerate(training_iter):
            if config.use_mem_track:
                gpu_tracker.track()
            if use_cuda:
                inputs = Variable(inputs.cuda())
                label = Variable(label.cuda())
                length = Variable(length.cuda())

            output = model(inputs, length)
            train_loss = model.loss_fn(output, label, length)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            with torch.no_grad():
                # Flatten predictions/labels and drop the -1 padding positions
                predicts = model.predict(output, length)
                predicts = predicts.view(1, -1).squeeze()
                predicts = predicts[predicts != -1]
                label = label.view(1, -1).squeeze()
                label = label[label != -1]
                train_acc, _ = model.evaluate(predicts, label)
                pbar.show_process(train_acc, train_loss.detach(), time.time() - start, index)

            if config.use_mem_track:
                gpu_tracker.track()

        if use_cuda:
            torch.cuda.empty_cache()

        model.eval()
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            predict_set, label_set = [], []
            count = 0
            for eval_inputs, eval_label, eval_length in eval_iter:
                if use_cuda:
                    eval_inputs, eval_label, eval_length = eval_inputs.cuda(), eval_label.cuda(), eval_length.cuda()
                output = model(eval_inputs, eval_length)
                eval_loss += model.loss_fn(output, eval_label, eval_length).detach()

                eval_predicts = model.predict(output, eval_length)
                eval_predicts = eval_predicts.view(1, -1).squeeze()
                eval_predicts = eval_predicts[eval_predicts != -1]
                predict_set.append(eval_predicts)

                eval_label = eval_label.view(1, -1).squeeze()
                eval_label = eval_label[eval_label != -1]
                label_set.append(eval_label)
                count += 1

            predict_set = torch.cat(predict_set, dim=0)
            label_set = torch.cat(label_set, dim=0)
            eval_acc, eval_f1 = model.evaluate(predict_set, label_set)
            model.class_report(predict_set, label_set)

        logger.info(
            '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc:%.4f - eval_acc:%.4f - eval_f1:%.4f\n'
            % (e + 1, train_loss.detach(), eval_loss / count, train_acc,
               eval_acc, eval_f1))

        # Save the best model
        if eval_f1 > best_f1:
            best_f1 = eval_f1
            model.save()

        if e % verbose == 0:
            train_losses.append(train_loss.detach())
            train_accuracy.append(train_acc)
            eval_losses.append(eval_loss / count)
            # eval_acc is already computed over the full concatenated eval set,
            # so it is appended as-is rather than divided by the batch count.
            eval_accuracy.append(eval_acc)

    model.save()
    loss_acc_plot(history)

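# The sequence-labeling `fit` above filters out -1 everywhere because
# `model.predict` and the gold labels pad positions beyond a sequence's true
# length with -1. A self-contained toy illustration of that flatten-and-filter
# step (tensors invented for the example):
def _demo_padding_filter():
    label = torch.tensor([[2, 5, 1, -1, -1],
                          [3, 3, -1, -1, -1]])  # two sequences padded with -1
    flat = label.view(1, -1).squeeze()          # flatten to a 1-D tensor of 10 ids
    flat = flat[flat != -1]                     # drop the padding positions
    assert flat.tolist() == [2, 5, 1, 3, 3]
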
def fit(self, training_data, eval_data, pbar, num_epochs=100, early_stopping_rounds=5, verbose=1, train_from_scratch=True):
    """train the model"""
    if train_from_scratch is False:
        self.restore_model()

    # Initialize best loss. This variable will store the lowest loss on the
    # eval dataset.
    best_loss = float('inf')

    # Initialize dictionary to store the loss history
    self.history['train_loss'] = []
    self.history['eval_loss'] = []
    self.history['train_accuracy'] = []
    self.history['eval_accuracy'] = []

    count = early_stopping_rounds

    # Begin training
    for i in range(num_epochs):
        # Reset the per-epoch accumulators so each epoch's mean loss/accuracy
        # is computed over that epoch only
        train_loss, eval_loss = [], []
        train_accuracy, eval_accuracy = [], []

        # Re-create the optimizer at the start of each epoch so the learning
        # rate can decay according to config.lr_mode
        learning_rate = lr_update(i + 1, mode=config.lr_mode)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

        # Training with gradient descent
        start = time.time()
        for index, (sequence, label, _) in enumerate(training_data):
            # On CPU the tensors must be cast explicitly, otherwise TF raises
            # "Could not find valid device"
            sequence = tf.cast(sequence, dtype=tf.float32)
            label = tf.cast(label, dtype=tf.int64)
            grads = self.grads_fn(sequence, label, training=True)
            optimizer.apply_gradients(zip(grads, self.variables))
            pbar.show(index, use_time=time.time() - start)

        # Compute the loss on the training data after one epoch
        for sequence, label, _ in training_data:
            sequence = tf.cast(sequence, dtype=tf.float32)
            label = tf.cast(label, dtype=tf.int64)
            train_los = self.loss_fn(sequence, label, training=False)
            train_acc = self.get_accuracy(sequence, label, training=False)
            train_loss.append(train_los)
            train_accuracy.append(train_acc)
        self.history['train_loss'].append(np.mean(train_loss))
        self.history['train_accuracy'].append(np.mean(train_accuracy))

        # Compute the loss on the eval data after one epoch
        for sequence, label, _ in eval_data:
            sequence = tf.cast(sequence, dtype=tf.float32)
            label = tf.cast(label, dtype=tf.int64)
            eval_los = self.loss_fn(sequence, label, training=False)
            eval_acc = self.get_accuracy(sequence, label, training=False)
            eval_loss.append(eval_los)
            eval_accuracy.append(eval_acc)
        self.history['eval_loss'].append(np.mean(eval_loss))
        self.history['eval_accuracy'].append(np.mean(eval_accuracy))

        # Print train and eval losses
        if (i == 0) or ((i + 1) % verbose == 0):
            print(
                'Epoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc:%.4f - eval_acc:%.4f'
                % (i + 1, self.history['train_loss'][-1],
                   self.history['eval_loss'][-1],
                   self.history['train_accuracy'][-1],
                   self.history['eval_accuracy'][-1]))

        # Check for early stopping
        if self.history['eval_loss'][-1] < best_loss:
            best_loss = self.history['eval_loss'][-1]
            count = early_stopping_rounds
        else:
            count -= 1
        if count == 0:
            break

    # Plot the loss/accuracy curves
    loss_acc_plot(history=self.history)

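# The eager-mode `fit` above pulls its per-epoch learning rate from `lr_update`,
# defined elsewhere in the repo. A minimal sketch of one plausible
# implementation, assuming exponential decay from a base rate;
# `config.initial_lr` and the 0.9 decay factor are assumptions, not values
# taken from this file:
def lr_update(epoch, mode='exponential'):
    if mode == 'exponential':
        # Decay the base rate by 10% per completed epoch (assumed schedule).
        return config.initial_lr * (0.9 ** (epoch - 1))
    # 'constant' or unknown modes: keep the base rate unchanged.
    return config.initial_lr
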
def fit(model, training_iter, eval_iter, num_epoch, pbar, num_train_steps, verbose=1):
    # ------------------ Select CUDA mode ----------------------
    device = torch.device(args.device if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # --------------------- Optimizer -------------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    t_total = num_train_steps

    ## --------------------- GPU half precision (fp16) -----------------------------
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    ## ------------------------ GPU single precision (fp32) ---------------------------
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    # --------------------- Model initialization ----------------------
    if args.fp16:
        model.half()
    model.to(device)

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []
    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    # ------------------------ Training ------------------------------
    best_f1 = 0
    start = time.time()
    global_step = 0
    for e in range(num_epoch):
        model.train()
        for step, batch in enumerate(training_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            bert_encode = model(input_ids, segment_ids, input_mask).cpu()
            train_loss = model.loss_fn(bert_encode=bert_encode,
                                       tags=label_ids,
                                       output_mask=output_mask)

            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(train_loss)
            else:
                train_loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            predicts = model.predict(bert_encode, output_mask)
            # Flatten the gold labels and drop the -1 padding positions
            label_ids = label_ids.view(1, -1)
            label_ids = label_ids[label_ids != -1]
            label_ids = label_ids.cpu()
            train_acc, f1 = model.acc_f1(predicts, label_ids)
            pbar.show_process(train_acc, train_loss.item(), f1, time.time() - start, step)

        # ----------------------- Validation ----------------------------
        model.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for step, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, output_mask = batch
                bert_encode = model(input_ids, segment_ids, input_mask).cpu()
                eval_los = model.loss_fn(bert_encode=bert_encode,
                                         tags=label_ids,
                                         output_mask=output_mask)
                eval_loss = eval_los + eval_loss
                count += 1
                predicts = model.predict(bert_encode, output_mask)
                y_predicts.append(predicts)
                label_ids = label_ids.view(1, -1)
                label_ids = label_ids[label_ids != -1]
                y_labels.append(label_ids)

            eval_predicted = torch.cat(y_predicts, dim=0).cpu()
            eval_labeled = torch.cat(y_labels, dim=0).cpu()
            eval_acc, eval_f1 = model.acc_f1(eval_predicted, eval_labeled)
            model.class_report(eval_predicted, eval_labeled)

        logger.info(
            '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc:%.4f - eval_acc:%.4f - eval_f1:%.4f\n'
            % (e + 1, train_loss.item(), eval_loss.item() / count, train_acc,
               eval_acc, eval_f1))

        # Save the best model
        if eval_f1 > best_f1:
            best_f1 = eval_f1
            save_model(model, args.output_dir)

        if e % verbose == 0:
            train_losses.append(train_loss.item())
            train_accuracy.append(train_acc)
            eval_losses.append(eval_loss.item() / count)
            eval_accuracy.append(eval_acc)

    loss_acc_plot(history)

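# The `no_decay` grouping used by both BERT variants above keeps weight decay
# off biases and LayerNorm parameters. A standalone, runnable check of what the
# substring filter matches (the parameter names below are illustrative, in the
# style of BERT's naming, not taken from this repo):
def _demo_no_decay_filter():
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    names = [
        'encoder.layer.0.attention.self.query.weight',  # decayed
        'encoder.layer.0.attention.self.query.bias',    # matched by 'bias'
        'encoder.layer.0.output.LayerNorm.weight',      # matched by 'LayerNorm.weight'
    ]
    decayed = [n for n in names if not any(nd in n for nd in no_decay)]
    assert decayed == ['encoder.layer.0.attention.self.query.weight']
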
def fit(model, training_iter, eval_iter, num_epoch, pbar, lr_decay_mode, initial_lr, verbose=1):
    model.apply(weights_init)
    if use_cuda:
        model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)
    loss_fn = nn.CrossEntropyLoss()

    train_losses = []
    eval_losses = []
    train_accuracy = []
    eval_accuracy = []
    history = {
        "train_loss": train_losses,
        "train_acc": train_accuracy,
        "eval_loss": eval_losses,
        "eval_acc": eval_accuracy
    }

    best_f1 = 0
    start = time.time()
    for e in range(num_epoch):
        if e > 0:
            lr_update(optimizer=optimizer, epoch=e, lr_decay_mode=lr_decay_mode)
        model.train()
        for index, (inputs, label) in enumerate(training_iter):
            if config.use_mem_track:
                gpu_tracker.track()
            if use_cuda:
                inputs = Variable(inputs.cuda())
                label = Variable(label.squeeze(1).cuda())

            y_preds = model(inputs)
            train_loss = loss_fn(y_preds, label)
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            train_acc, _ = model.evaluate(y_preds, label)
            pbar.show_process(train_acc, train_loss.data, time.time() - start, index)
            if config.use_mem_track:
                gpu_tracker.track()

        if use_cuda:
            torch.cuda.empty_cache()

        model.eval()
        count = 0
        y_predicts, y_labels = [], []
        eval_loss, eval_acc, eval_f1 = 0, 0, 0
        with torch.no_grad():
            for eval_inputs, eval_label in eval_iter:
                if use_cuda:
                    eval_inputs, eval_label = eval_inputs.cuda(), eval_label.squeeze(1).cuda()
                eval_y_preds = model(eval_inputs)
                eval_loss += loss_fn(eval_y_preds, eval_label).data
                y_predicts.append(eval_y_preds)
                y_labels.append(eval_label)
                count += 1

            eval_predicted = torch.cat(y_predicts, dim=0)
            eval_labeled = torch.cat(y_labels, dim=0)
            eval_acc, eval_f1 = model.evaluate(eval_predicted, eval_labeled)
            model.class_report(eval_predicted, eval_labeled)

        logger.info(
            '\n\nEpoch %d - train_loss: %.4f - eval_loss: %.4f - train_acc:%.4f - eval_acc:%.4f - eval_f1:%.4f\n'
            % (e + 1, train_loss.data, eval_loss / count, train_acc,
               eval_acc, eval_f1))

        # Save the best model
        if eval_f1 > best_f1:
            best_f1 = eval_f1
            model.save()

        if e % verbose == 0:
            train_losses.append(train_loss.data)
            train_accuracy.append(train_acc)
            eval_losses.append(eval_loss / count)
            eval_accuracy.append(eval_acc)

    loss_acc_plot(history)

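# All of the classification loops above rely on `model.evaluate` returning an
# (accuracy, F1) pair from logits and gold labels. A minimal sketch of that
# contract using sklearn; this is an assumption about the method's behavior,
# not the repo's actual implementation (macro averaging is a guess):
def _evaluate_sketch(logits, labels):
    from sklearn.metrics import accuracy_score, f1_score
    preds = torch.argmax(logits, dim=1).cpu().numpy()  # class with the highest logit
    gold = labels.cpu().numpy()
    return accuracy_score(gold, preds), f1_score(gold, preds, average='macro')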