def evaluate(logger, device, model, criterion, dev_data_loader):
    """Evaluate on the validation set, computing f1, precision and recall."""
    model.eval()
    start_time = time.time()
    loss_sum = 0.0
    correct_preds = 0
    all_predicts = []
    all_labels = []
    with torch.no_grad():
        for step, (batch_ids, batch_masks, batch_segments, batch_labels) in enumerate(tqdm(dev_data_loader)):
            ids, masks, segments, labels = batch_ids.to(device), batch_masks.to(device), \
                batch_segments.to(device), batch_labels.to(device)
            logits, probabilities = model(ids, masks, segments)
            loss = criterion(logits, labels)
            loss_sum += loss.item()
            correct_preds += correct_predictions(probabilities, labels)
            predicts = torch.argmax(probabilities, dim=1)
            all_predicts.extend(predicts.cpu())
            all_labels.extend(batch_labels.cpu())
    # Convert to minutes, since the log line below reports "(min)"
    val_time = (time.time() - start_time) / 60
    val_loss = loss_sum / len(dev_data_loader)
    val_accuracy = correct_preds / len(dev_data_loader.dataset)
    val_measures = cal_metrics(all_predicts, all_labels)
    val_measures['accuracy'] = val_accuracy
    # Log the validation metrics
    res_str = ''
    for k, v in val_measures.items():
        res_str += (k + ': %.3f ' % v)
    logger.info('loss: %.5f, %s' % (val_loss, res_str))
    logger.info('time consumption of evaluating: %.2f(min)' % val_time)
    return val_measures, all_predicts
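# `correct_predictions` is called above but not defined in this section. A
# minimal sketch consistent with its call sites (softmax probabilities of
# shape [batch, num_classes], integer labels, returning a count that is
# accumulated across batches); the body below is an assumption, not the
# original implementation.
def correct_predictions(probabilities, labels):
    predicts = torch.argmax(probabilities, dim=1)
    return (predicts == labels).sum().item()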
def train(configs, data_manager, logger):
    domain_classes = data_manager.domain_class_number
    intent_classes = data_manager.intent_class_number
    slot_classes = data_manager.slot_class_number
    id2slot = data_manager.id2slot
    learning_rate = configs.learning_rate
    epoch = configs.epoch
    batch_size = configs.batch_size
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    bert_model = TFBertModel.from_pretrained('bert-base-chinese')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    X_train, att_mask_train, domain_train, intent_train, slot_train, \
        X_val, att_mask_val, domain_val, intent_val, slot_val = data_manager.get_training_set()
    bilstm_crf_model = BiLSTM_CRFModel(configs, slot_classes)
    domain_model = DomainClassificationModel(configs, domain_classes)
    intent_model = IntentClassificationModel(configs, intent_classes)
    num_iterations = int(math.ceil(1.0 * len(X_train) / batch_size))
    num_val_iterations = int(math.ceil(1.0 * len(X_val) / batch_size))
    logger.info(('+' * 20) + 'training starting' + ('+' * 20))
    for i in range(epoch):
        start_time = time.time()
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for iteration in tqdm(range(num_iterations)):
            X_train_batch, att_mask_train_batch, domain_train_batch, intent_train_batch, slot_train_batch \
                = data_manager.next_batch(X_train, att_mask_train, domain_train, intent_train, slot_train,
                                          start_index=iteration * batch_size)
            inputs_length = tf.math.count_nonzero(X_train_batch, 1)
            # BERT output computed outside the tape: the encoder acts as a
            # frozen feature extractor, only the three task heads are trained
            bert_model_inputs = bert_model(X_train_batch, attention_mask=att_mask_train_batch)[0]
            with tf.GradientTape() as tape:
                # Slot-filling model forward pass
                slot_logits, slot_log_likelihood, slot_transition_params = bilstm_crf_model.call(
                    inputs=bert_model_inputs, inputs_length=inputs_length,
                    targets=slot_train_batch, training=True)
                slot_loss = -tf.reduce_mean(slot_log_likelihood)
                # Domain model forward pass on the [CLS] vector
                domain_logits = domain_model.call(inputs=bert_model_inputs[:, 0, :], training=True)
                domain_loss_vec = tf.keras.losses.sparse_categorical_crossentropy(
                    y_pred=domain_logits, y_true=domain_train_batch)
                domain_loss = tf.reduce_mean(domain_loss_vec)
                # Intent model forward pass on the [CLS] vector
                intent_logits = intent_model.call(inputs=bert_model_inputs[:, 0, :], training=True)
                intent_loss_vec = tf.keras.losses.sparse_categorical_crossentropy(
                    y_pred=intent_logits, y_true=intent_train_batch)
                intent_loss = tf.reduce_mean(intent_loss_vec)
                # Joint loss with the slot task weighted twice as heavily
                total_loss = domain_loss + intent_loss + 2 * slot_loss
            # Collect the trainable variables of the three task heads
            trainable_variables = (bilstm_crf_model.trainable_variables
                                   + domain_model.trainable_variables
                                   + intent_model.trainable_variables)
            # Back-propagation via automatic differentiation
            gradients = tape.gradient(total_loss, trainable_variables)
            optimizer.apply_gradients(zip(gradients, trainable_variables))
            if iteration % configs.print_per_batch == 0 and iteration != 0:
                domain_predictions = tf.argmax(domain_logits, axis=-1)
                intent_predictions = tf.argmax(intent_logits, axis=-1)
                domain_measures = cal_metrics(y_true=domain_train_batch, y_pred=domain_predictions)
                intent_measures = cal_metrics(y_true=intent_train_batch, y_pred=intent_predictions)
                batch_pred_sequence, _ = crf_decode(slot_logits, slot_transition_params, inputs_length)
                slot_measures = cal_slots_metrics(X_train_batch, slot_train_batch, batch_pred_sequence,
                                                  id2slot, tokenizer)
                domain_str = ''
                for k, v in domain_measures.items():
                    domain_str += (k + ': %.3f ' % v)
                logger.info('training batch: {}'.format(iteration))
                logger.info('domain_loss: %.5f, %s' % (domain_loss, domain_str))
                intent_str = ''
                for k, v in intent_measures.items():
                    intent_str += (k + ': %.3f ' % v)
                logger.info('intent_loss: %.5f, %s' % (intent_loss, intent_str))
                slot_str = ''
                for k, v in slot_measures.items():
                    slot_str += (k + ': %.3f ' % v)
                logger.info('slot_loss: %.5f, %s' % (slot_loss, slot_str))
        # validation
        logger.info('start evaluate engines...')
        slot_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        domain_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        intent_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        for iteration in tqdm(range(num_val_iterations)):
            X_val_batch, att_mask_val_batch, domain_val_batch, intent_val_batch, slot_val_batch \
                = data_manager.next_batch(X_val, att_mask_val, domain_val, intent_val, slot_val,
                                          start_index=iteration * batch_size)
            inputs_length = tf.math.count_nonzero(X_val_batch, 1)
            # BERT encoder output
            bert_model_inputs = bert_model(X_val_batch, attention_mask=att_mask_val_batch)[0]
            # Slot model prediction
            slot_logits, slot_log_likelihood, slot_transition_params = bilstm_crf_model.call(
                inputs=bert_model_inputs, inputs_length=inputs_length, targets=slot_val_batch)
            batch_pred_sequence, _ = crf_decode(slot_logits, slot_transition_params, inputs_length)
            slot_measures = cal_slots_metrics(X_val_batch, slot_val_batch, batch_pred_sequence,
                                              id2slot, tokenizer)
            # Domain model prediction
            domain_logits = domain_model.call(inputs=bert_model_inputs[:, 0, :])
            domain_predictions = tf.argmax(domain_logits, axis=-1)
            domain_measures = cal_metrics(y_true=domain_val_batch, y_pred=domain_predictions)
            # Intent model prediction
            intent_logits = intent_model.call(inputs=bert_model_inputs[:, 0, :])
            intent_predictions = tf.argmax(intent_logits, axis=-1)
            intent_measures = cal_metrics(y_true=intent_val_batch, y_pred=intent_predictions)
            for k, v in slot_measures.items():
                slot_val_results[k] += v
            for k, v in domain_measures.items():
                domain_val_results[k] += v
            for k, v in intent_measures.items():
                intent_val_results[k] += v
        time_span = (time.time() - start_time) / 60
        val_slot_str = ''
        val_domain_str = ''
        val_intent_str = ''
        # Average the per-batch metrics over the validation iterations
        for k in slot_val_results:
            slot_val_results[k] /= num_val_iterations
            val_slot_str += (k + ': %.3f ' % slot_val_results[k])
        for k in domain_val_results:
            domain_val_results[k] /= num_val_iterations
            val_domain_str += (k + ': %.3f ' % domain_val_results[k])
        for k in intent_val_results:
            intent_val_results[k] /= num_val_iterations
            val_intent_str += (k + ': %.3f ' % intent_val_results[k])
        logger.info('slot: {}'.format(val_slot_str))
        logger.info('domain: {}'.format(val_domain_str))
        logger.info('intent: {}'.format(val_intent_str))
        logger.info('time consumption:%.2f(min)' % time_span)
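# The `crf_decode` used above is not imported in this section; in a TF2
# codebase it is presumably tensorflow_addons.text.crf_decode (an assumption
# about the repo's imports). A self-contained sketch of its signature and
# outputs:
import tensorflow as tf
import tensorflow_addons as tfa

batch_size, max_len, num_tags = 2, 5, 4
emission_scores = tf.random.normal([batch_size, max_len, num_tags])  # per-token tag scores
transition_params = tf.random.normal([num_tags, num_tags])           # learned tag-transition matrix
sequence_lengths = tf.constant([5, 3])                                # true (unpadded) lengths
# Viterbi decoding: highest-scoring tag sequence per example plus its score
decode_tags, best_score = tfa.text.crf_decode(emission_scores, transition_params, sequence_lengths)
print(decode_tags.shape)  # (2, 5); positions past each sequence length are padding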
def train(data_manager, logger):
    embedding_dim = data_manager.embedding_dim
    num_classes = data_manager.max_label_number
    seq_length = data_manager.max_sequence_length
    train_file = classifier_config['train_file']
    dev_file = classifier_config['dev_file']
    train_df = pd.read_csv(train_file).sample(frac=1)
    if dev_file == '':
        # split the data into train and validation set
        train_df, dev_df = train_df[:int(len(train_df) * 0.9)], train_df[int(len(train_df) * 0.9):]
    else:
        dev_df = pd.read_csv(dev_file).sample(frac=1)
    train_dataset = data_manager.get_dataset(train_df, step='train')
    dev_dataset = data_manager.get_dataset(dev_df)
    vocab_size = data_manager.vocab_size
    embedding_method = classifier_config['embedding_method']
    if embedding_method == 'Bert':
        from transformers import TFBertModel
        bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
    else:
        bert_model = None
    checkpoints_dir = classifier_config['checkpoints_dir']
    checkpoint_name = classifier_config['checkpoint_name']
    num_filters = classifier_config['num_filters']
    learning_rate = classifier_config['learning_rate']
    epoch = classifier_config['epoch']
    max_to_keep = classifier_config['max_to_keep']
    print_per_batch = classifier_config['print_per_batch']
    is_early_stop = classifier_config['is_early_stop']
    patient = classifier_config['patient']
    hidden_dim = classifier_config['hidden_dim']
    classifier = classifier_config['classifier']
    reverse_classes = {str(class_id): class_name
                       for class_name, class_id in data_manager.class_id.items()}
    best_f1_val = 0.0
    best_at_epoch = 0
    unprocessed = 0
    batch_size = data_manager.batch_size
    very_start_time = time.time()
    loss_obj = FocalLoss() if classifier_config['use_focal_loss'] else None
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # Build the configured model
    if classifier == 'textcnn':
        from engines.models.textcnn import TextCNN
        model = TextCNN(seq_length, num_filters, num_classes, embedding_dim, vocab_size)
    elif classifier == 'textrcnn':
        from engines.models.textrcnn import TextRCNN
        model = TextRCNN(seq_length, num_classes, hidden_dim, embedding_dim, vocab_size)
    elif classifier == 'textrnn':
        from engines.models.textrnn import TextRNN
        model = TextRNN(seq_length, num_classes, hidden_dim, embedding_dim, vocab_size)
    else:
        raise Exception('the configured classifier does not exist')
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, directory=checkpoints_dir, checkpoint_name=checkpoint_name, max_to_keep=max_to_keep)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print('Restored from {}'.format(checkpoint_manager.latest_checkpoint))
    else:
        print('Initializing from scratch.')
    logger.info(('+' * 20) + 'training starting' + ('+' * 20))
    for i in range(epoch):
        start_time = time.time()
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for step, batch in tqdm(train_dataset.shuffle(len(train_dataset)).batch(batch_size).enumerate()):
            X_train_batch, y_train_batch = batch
            if embedding_method == 'Bert':
                # BERT serves as a frozen embedding layer: its output is
                # computed outside the tape and only `model` is trained
                X_train_batch = bert_model(X_train_batch)[0]
            with tf.GradientTape() as tape:
                logits = model(X_train_batch, training=True)
                if classifier_config['use_focal_loss']:
                    loss_vec = loss_obj.call(y_true=y_train_batch, y_pred=logits)
                else:
                    loss_vec = tf.keras.losses.categorical_crossentropy(y_true=y_train_batch, y_pred=logits)
                loss = tf.reduce_mean(loss_vec)
            # Back-propagation via automatic differentiation
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            if step % print_per_batch == 0 and step != 0:
                predictions = tf.argmax(logits, axis=-1).numpy()
                y_train_batch = tf.argmax(y_train_batch, axis=-1).numpy()
                measures, _ = cal_metrics(y_true=y_train_batch, y_pred=predictions)
                res_str = ''
                for k, v in measures.items():
                    res_str += (k + ': %.3f ' % v)
                logger.info('training batch: %5d, loss: %.5f, %s' % (step, loss, res_str))
        # validation
        logger.info('start evaluate engines...')
        y_true, y_pred = np.array([]), np.array([])
        loss_values = []
        for dev_batch in tqdm(dev_dataset.batch(batch_size)):
            X_val_batch, y_val_batch = dev_batch
            if embedding_method == 'Bert':
                X_val_batch = bert_model(X_val_batch)[0]
            logits = model(X_val_batch)
            val_loss_vec = tf.keras.losses.categorical_crossentropy(y_true=y_val_batch, y_pred=logits)
            val_loss = tf.reduce_mean(val_loss_vec)
            predictions = tf.argmax(logits, axis=-1)
            y_val_batch = tf.argmax(y_val_batch, axis=-1)
            y_true = np.append(y_true, y_val_batch)
            y_pred = np.append(y_pred, predictions)
            loss_values.append(val_loss)
        measures, each_classes = cal_metrics(y_true=y_true, y_pred=y_pred)
        # Log per-class metrics
        classes_val_str = ''
        for k, v in each_classes.items():
            if k in reverse_classes:
                classes_val_str += ('\n' + reverse_classes[k] + ': ' + str(each_classes[k]))
        logger.info(classes_val_str)
        # Log the validation loss
        val_res_str = 'loss: %.3f ' % np.mean(loss_values)
        for k, v in measures.items():
            val_res_str += (k + ': %.3f ' % measures[k])
        time_span = (time.time() - start_time) / 60
        logger.info('time consumption:%.2f(min), %s' % (time_span, val_res_str))
        if measures['f1'] > best_f1_val:
            unprocessed = 0
            best_f1_val = measures['f1']
            best_at_epoch = i + 1
            checkpoint_manager.save()
            logger.info('saved the new best model with f1: %.3f' % best_f1_val)
        else:
            unprocessed += 1
        if is_early_stop:
            if unprocessed >= patient:
                logger.info('early stopped, no progress obtained within {} epochs'.format(patient))
                logger.info('overall best f1 is {} at {} epoch'.format(best_f1_val, best_at_epoch))
                logger.info('total training time consumption: %.3f(min)'
                            % ((time.time() - very_start_time) / 60))
                return
    logger.info('overall best f1 is {} at {} epoch'.format(best_f1_val, best_at_epoch))
    logger.info('total training time consumption: %.3f(min)' % ((time.time() - very_start_time) / 60))
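# `FocalLoss` is imported from elsewhere in the repo and not shown in this
# section. Below is a minimal sketch of the standard focal loss (Lin et al.,
# 2017) matching the call(y_true, y_pred) usage above, assuming one-hot
# labels and softmax outputs; the gamma/alpha defaults are assumptions, not
# the repo's values.
class FocalLoss:
    def __init__(self, gamma=2.0, alpha=0.25, epsilon=1e-9):
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon

    def call(self, y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.clip_by_value(y_pred, self.epsilon, 1.0)
        # Down-weight well-classified examples by (1 - p)^gamma
        weight = self.alpha * y_true * tf.pow(1.0 - y_pred, self.gamma)
        return -tf.reduce_sum(weight * tf.math.log(y_pred), axis=-1)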
def train(device, logger):
    # Hyper-parameters
    batch_size = 128
    epoch = 10
    learning_rate = 0.0004
    patience = 3
    print_per_batch = 40
    folds = 5
    test_predicts_folds = [0] * folds
    # Load the training corpus
    train_query_file = 'datasets/train/train.query.tsv'
    train_reply_file = 'datasets/train/train.reply.tsv'
    train_left = pd.read_csv(train_query_file, sep='\t', header=None)
    train_left.columns = ['id', 'query']
    train_right = pd.read_csv(train_reply_file, sep='\t', header=None)
    train_right.columns = ['id', 'id_sub', 'reply', 'label']
    train_data = train_left.merge(train_right, how='left')
    train_data['reply'] = train_data['reply'].fillna('好的')
    oof = np.zeros((len(train_data), 1))
    # Load the test corpus
    test_query_file = 'datasets/test/test.query.tsv'
    test_reply_file = 'datasets/test/test.reply.tsv'
    test_left = pd.read_csv(test_query_file, sep='\t', header=None, encoding='gbk')
    test_left.columns = ['id', 'query']
    test_right = pd.read_csv(test_reply_file, sep='\t', header=None, encoding='gbk')
    test_right.columns = ['id', 'id_sub', 'reply']
    test_data = test_left.merge(test_right, how='left')
    test_data['label'] = 666  # placeholder label for the unlabeled test set
    test_data_manger = DataPrecessForSentence(test_data, logger)
    logger.info('test_data_length:{}\n'.format(len(test_data_manger)))
    test_loader = DataLoader(test_data_manger, shuffle=False, batch_size=batch_size)
    # Cross-entropy loss
    criterion = torch.nn.CrossEntropyLoss()
    # K-fold cross validation, grouped by query id
    gkf = GroupKFold(n_splits=folds).split(X=train_data.reply, groups=train_data.id)
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        best_f1 = 0.0
        # Reset the early-stopping counter once per fold, not per epoch
        patience_counter = 0
        logger.info('fold:{}/{}'.format(fold + 1, folds))
        train_data_manger = DataPrecessForSentence(train_data.iloc[train_idx], logger)
        logger.info('train_data_length:{}\n'.format(len(train_data_manger)))
        train_loader = DataLoader(train_data_manger, shuffle=True, batch_size=batch_size)
        val_data_manger = DataPrecessForSentence(train_data.iloc[valid_idx], logger)
        logger.info('dev_data_length:{}\n'.format(len(val_data_manger)))
        val_loader = DataLoader(val_data_manger, shuffle=False, batch_size=batch_size)
        model = BertwwmModel(device).to(device)
        params = list(model.parameters())
        optimizer = AdamW(params, lr=learning_rate)
        # Learning-rate schedule: halve the lr when the monitored metric plateaus
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=0)
        for i in range(epoch):
            train_start = time.time()
            logger.info('epoch:{}/{}'.format(i + 1, epoch))
            loss, loss_sum = 0.0, 0.0
            correct_preds = 0
            model.train()
            for step, (batch_ids, batch_masks, batch_segments, batch_labels) in enumerate(tqdm(train_loader)):
                ids, masks, segments, labels = batch_ids.to(device), batch_masks.to(device), \
                    batch_segments.to(device), batch_labels.to(device)
                optimizer.zero_grad()
                logits, probabilities = model(ids, masks, segments)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()
                loss_sum += loss.item()
                correct_preds += correct_predictions(probabilities, labels)
                # Log training metrics periodically
                if step % print_per_batch == 0 and step != 0:
                    predicts = torch.argmax(probabilities, dim=1)
                    measures = cal_metrics(predicts.cpu(), labels.cpu())
                    res_str = ''
                    for k, v in measures.items():
                        res_str += (k + ': %.3f ' % v)
                    logger.info('training step: %5d, loss: %.5f, %s' % (step, loss, res_str))
            train_time = (time.time() - train_start) / 60
            train_accuracy = correct_preds / len(train_loader.dataset)
            scheduler.step(train_accuracy)
            logger.info('time consumption of training:%.2f(min)' % train_time)
            logger.info('start evaluate model...')
            val_measures, val_label_results = evaluate(logger, device, model, criterion, val_loader)
            if val_measures['f1'] >= best_f1 and val_measures['f1'] > 0.70:
                patience_counter = 0
                best_f1 = val_measures['f1']
                logger.info('find the new best model with f1 in fold %d: %.3f' % (fold + 1, best_f1))
                logger.info('start test model...')
                test_label_results = test(logger, device, model, test_loader)
                # Record this fold's test-set predictions
                test_predicts_folds[fold] = test_label_results
                # Record this fold's validation predictions in the out-of-fold array
                oof[valid_idx] = [[i] for i in val_label_results]
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    logger.info('Early stopping: patience limit reached, stopping...')
                    break
    outputs = compute_output_arrays(train_data, 'label')
    best_f1, best_threshold = search_f1(outputs, oof)
    logger.info('best_f1 is %.3f, best_threshold is %.3f' % (best_f1, best_threshold))
    # Average the per-fold test predictions and binarize at the searched threshold
    sub_predicts = np.average(test_predicts_folds, axis=0)
    sub_predicts = sub_predicts > best_threshold
    test_data['label'] = sub_predicts.astype(int)
    test_data[['id', 'id_sub', 'label']].to_csv('./submission_file/submission_bertwwm_esim_fgm.csv',
                                                index=False, header=None, sep='\t')
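# `search_f1` and `compute_output_arrays` are imported from a utilities
# module that is not part of this section. The sketch below shows one way a
# threshold search over the out-of-fold predictions could work, matching the
# (outputs, oof) call above; the threshold grid and implementation details
# are assumptions.
import numpy as np
from sklearn.metrics import f1_score

def search_f1(y_true, y_scores):
    best_f1, best_threshold = 0.0, 0.5
    # Sweep candidate thresholds, keeping the one that maximizes f1
    for threshold in np.arange(0.1, 0.9, 0.01):
        f1 = f1_score(y_true, (y_scores.ravel() > threshold).astype(int))
        if f1 > best_f1:
            best_f1, best_threshold = f1, threshold
    return best_f1, best_threshold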
def train(data_manager, logger):
    embedding_dim = data_manager.embedding_dim
    num_classes = data_manager.max_label_number
    seq_length = data_manager.max_sequence_length
    checkpoints_dir = classifier_config['checkpoints_dir']
    checkpoint_name = classifier_config['checkpoint_name']
    num_filters = classifier_config['num_filters']
    learning_rate = classifier_config['learning_rate']
    epoch = classifier_config['epoch']
    max_to_keep = classifier_config['max_to_keep']
    print_per_batch = classifier_config['print_per_batch']
    is_early_stop = classifier_config['is_early_stop']
    patient = classifier_config['patient']
    hidden_dim = classifier_config['hidden_dim']
    classifier = classifier_config['classifier']
    best_f1_val = 0.0
    best_at_epoch = 0
    unprocessed = 0
    batch_size = data_manager.batch_size
    very_start_time = time.time()
    loss_obj = FocalLoss() if classifier_config['use_focal_loss'] else None
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    X_train, y_train, X_val, y_val = data_manager.get_training_set()
    # Build the configured model
    if classifier == 'textcnn':
        from engines.models.textcnn import TextCNN
        model = TextCNN(seq_length, num_filters, num_classes, embedding_dim)
    elif classifier == 'textrcnn':
        from engines.models.textrcnn import TextRCNN
        model = TextRCNN(seq_length, num_classes, hidden_dim, embedding_dim)
    else:
        raise Exception('the configured classifier does not exist')
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, directory=checkpoints_dir, checkpoint_name=checkpoint_name, max_to_keep=max_to_keep)
    num_iterations = int(math.ceil(1.0 * len(X_train) / batch_size))
    num_val_iterations = int(math.ceil(1.0 * len(X_val) / batch_size))
    logger.info(('+' * 20) + 'training starting' + ('+' * 20))
    for i in range(epoch):
        start_time = time.time()
        # Shuffle the training set at each epoch
        sh_index = np.arange(len(X_train))
        np.random.shuffle(sh_index)
        X_train = X_train[sh_index]
        y_train = y_train[sh_index]
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for iteration in tqdm(range(num_iterations)):
            X_train_batch, y_train_batch = data_manager.next_batch(
                X_train, y_train, start_index=iteration * batch_size)
            with tf.GradientTape() as tape:
                logits = model.call(X_train_batch, training=True)
                if classifier_config['use_focal_loss']:
                    loss_vec = loss_obj.call(y_true=y_train_batch, y_pred=logits)
                else:
                    loss_vec = tf.keras.losses.categorical_crossentropy(y_true=y_train_batch, y_pred=logits)
                loss = tf.reduce_mean(loss_vec)
            # Back-propagation via automatic differentiation
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            if iteration % print_per_batch == 0 and iteration != 0:
                predictions = tf.argmax(logits, axis=-1)
                y_train_batch = tf.argmax(y_train_batch, axis=-1)
                measures = cal_metrics(y_true=y_train_batch, y_pred=predictions)
                res_str = ''
                for k, v in measures.items():
                    res_str += (k + ': %.3f ' % v)
                logger.info('training batch: %5d, loss: %.5f, %s' % (iteration, loss, res_str))
        # validation
        logger.info('start evaluate engines...')
        val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        for iteration in tqdm(range(num_val_iterations)):
            X_val_batch, y_val_batch = data_manager.next_batch(X_val, y_val, iteration * batch_size)
            logits = model.call(X_val_batch)
            predictions = tf.argmax(logits, axis=-1)
            y_val_batch = tf.argmax(y_val_batch, axis=-1)
            measures = cal_metrics(y_true=y_val_batch, y_pred=predictions)
            for k, v in measures.items():
                val_results[k] += v
        time_span = (time.time() - start_time) / 60
        val_res_str = ''
        dev_f1_avg = 0
        # Average the per-batch metrics over the validation iterations
        for k, v in val_results.items():
            val_results[k] /= num_val_iterations
            val_res_str += (k + ': %.3f ' % val_results[k])
            if k == 'f1':
                dev_f1_avg = val_results[k]
        logger.info('time consumption:%.2f(min), %s' % (time_span, val_res_str))
        if dev_f1_avg > best_f1_val:
            unprocessed = 0
            best_f1_val = dev_f1_avg
            best_at_epoch = i + 1
            checkpoint_manager.save()
            logger.info('saved the new best model with f1: %.3f' % best_f1_val)
        else:
            unprocessed += 1
        if is_early_stop:
            if unprocessed >= patient:
                logger.info('early stopped, no progress obtained within {} epochs'.format(patient))
                logger.info('overall best f1 is {} at {} epoch'.format(best_f1_val, best_at_epoch))
                logger.info('total training time consumption: %.3f(min)'
                            % ((time.time() - very_start_time) / 60))
                return
    logger.info('overall best f1 is {} at {} epoch'.format(best_f1_val, best_at_epoch))
    logger.info('total training time consumption: %.3f(min)' % ((time.time() - very_start_time) / 60))
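# `data_manager.next_batch` is defined elsewhere in the repo; the slicing
# behavior implied by the call sites above (a fixed-size window starting at
# start_index) might look like this minimal sketch. The handling of the
# final, possibly shorter batch is an assumption.
def next_batch(X, y, start_index, batch_size=64):
    end_index = min(start_index + batch_size, len(X))
    return X[start_index:end_index], y[start_index:end_index]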