def validate_one_batch(self, test_batch, task_name, log_writer, epoch):
    S, Q = test_batch
    Q_tag_ids = Q['tag_ids']
    S_tag_ids = S['tag_ids']
    Q_seq_len_list = Q['lens']
    Q_seq_len_list_plus2 = [x + 2 for x in Q_seq_len_list]
    Q_tag_ids_padded = pad_tag_ids(Q_tag_ids)
    S_tag_ids_padded = pad_tag_ids(S_tag_ids)
    Q['tag_ids'] = Q_tag_ids_padded
    S['tag_ids'] = S_tag_ids_padded

    logits = self([S, Q])
    loss = self.crf_loss(logits, Q_tag_ids_padded, Q_seq_len_list_plus2)
    pred_tags, pred_best_score = crf.crf_decode(
        potentials=logits,
        transition_params=self.trans_p,
        sequence_length=Q_seq_len_list_plus2)
    pred_tags_masked = seq_masking(pred_tags, Q_seq_len_list_plus2)

    p_tags_char, _ = get_id2tag_V2(pred_tags_masked, Q_seq_len_list_plus2, taskname=task_name)
    t_tags_char, _ = get_id2tag_V2(Q_tag_ids_padded, Q_seq_len_list_plus2, taskname=task_name)
    (P, R, F1), _ = evaluate(t_tags_char, p_tags_char, verbose=True)
    write_to_log(loss, P, R, F1, t_tags_char, log_writer, epoch)
    return (loss, pred_tags_masked, Q_tag_ids_padded, P, R, F1)
def __init__(self, batch_size, n_class, ball_num, w_size, embedding_size, words_size, hidden_size, layer_size):
    self._inputs = tf.keras.layers.Input(
        shape=(w_size, ball_num), batch_size=batch_size, name="red_inputs"
    )
    self._tag_indices = tf.keras.layers.Input(
        shape=(ball_num, ), batch_size=batch_size, dtype=tf.int32, name="red_tag_indices"
    )
    self._sequence_length = tf.keras.layers.Input(
        shape=(), batch_size=batch_size, dtype=tf.int32, name="sequence_length"
    )
    # build the feature extractor
    embedding = tf.keras.layers.Embedding(words_size, embedding_size)(self._inputs)
    first_lstm = tf.convert_to_tensor(
        [tf.keras.layers.LSTM(hidden_size)(embedding[:, :, i, :]) for i in range(ball_num)]
    )
    first_lstm = tf.transpose(first_lstm, perm=[1, 0, 2])
    # stack `layer_size` LSTM layers on top of the per-ball features
    second_lstm = first_lstm
    for _ in range(layer_size):
        second_lstm = tf.keras.layers.LSTM(hidden_size, return_sequences=True)(second_lstm)
    self._outputs = tf.keras.layers.Dense(n_class)(second_lstm)
    # build the loss
    self._log_likelihood, self._transition_params = crf_log_likelihood(
        self._outputs, self._tag_indices, self._sequence_length
    )
    self._loss = tf.reduce_sum(-self._log_likelihood)
    # build the prediction (Viterbi decode)
    self._pred_sequence, self._viterbi_score = crf_decode(
        self._outputs, self._transition_params, self._sequence_length
    )
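# ---------------------------------------------------------------------------
# A minimal standalone sketch of the loss side used in the constructor above:
# crf_log_likelihood returns a per-sequence log-likelihood plus the transition
# matrix. The tensorflow_addons import and the toy shapes are assumptions, not
# taken from the snippet itself.
import tensorflow as tf
import tensorflow_addons as tfa

outputs = tf.random.normal([2, 6, 5])                  # unary scores [batch, n_steps, n_tags]
tag_indices = tf.random.uniform([2, 6], maxval=5, dtype=tf.int32)
sequence_lengths = tf.constant([6, 4], dtype=tf.int32)
transition_params = tf.random.normal([5, 5])           # tag-to-tag transition scores

log_likelihood, transition_params = tfa.text.crf_log_likelihood(
    outputs, tag_indices, sequence_lengths, transition_params)
loss = tf.reduce_sum(-log_likelihood)                  # same reduction as in the model above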
def get_viterbi_decoding(self, potentials, sequence_length):
    # decode_tags: a [batch_size, max_seq_len] matrix with dtype `tf.int32`
    decode_tags, best_score = crf_decode(
        potentials, self.chain_kernel, sequence_length
    )
    return decode_tags, best_score
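# ---------------------------------------------------------------------------
# For reference, a minimal standalone sketch of the decode call this wrapper
# relies on, assuming `crf_decode` is the tensorflow_addons implementation
# (tfa.text.crf_decode). The toy shapes below are assumptions.
import tensorflow as tf
import tensorflow_addons as tfa

potentials = tf.random.normal([2, 5, 4])        # per-token emission scores [batch, max_len, n_tags]
transition_params = tf.random.normal([4, 4])    # tag-to-tag transition scores [n_tags, n_tags]
sequence_length = tf.constant([5, 3], dtype=tf.int32)

decode_tags, best_score = tfa.text.crf_decode(potentials, transition_params, sequence_length)
# decode_tags: [2, 5] int32 Viterbi tag ids; best_score: [2] score of each best path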
def predict_one(self, sentence):
    """
    Run NER on the input sentence and take the first row of the batch result.
    :param sentence:
    :return:
    """
    if self.configs.use_bert:
        X, y, att_mask, Sentence = self.dataManager.prepare_single_sentence(sentence)
        if self.configs.finetune:
            model_inputs = (X, att_mask)
        else:
            model_inputs = self.bert_model(X, attention_mask=att_mask)[0]
    else:
        X, y, Sentence = self.dataManager.prepare_single_sentence(sentence)
        model_inputs = X
    inputs_length = tf.math.count_nonzero(X, 1)
    logits, log_likelihood, transition_params = self.ner_model(
        inputs=model_inputs, inputs_length=inputs_length, targets=y)
    label_predicts, _ = crf_decode(logits, transition_params, inputs_length)
    label_predicts = label_predicts.numpy()
    sentence = Sentence[0, 0:inputs_length[0]]
    y_pred = [str(self.dataManager.id2label[val])
              for val in label_predicts[0][0:inputs_length[0]]]
    if self.configs.use_bert:
        # drop the positions corresponding to [CLS] and [SEP]
        y_pred = y_pred[1:-1]
    entities, suffixes, indices = extract_entity(sentence, y_pred, self.dataManager)
    return entities, suffixes, indices
def accuracy(y_true, y_pred):
    # presumably defined where `self` is a CRF layer exposing `self.transitions`
    # and `self.output_dim` (e.g. as a closure inside the layer)
    shape = tf.shape(y_pred)
    sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * shape[1]
    viterbi_sequence, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
    output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
    return tf.keras.metrics.categorical_accuracy(y_true, output)
def predict_one_batch(self, test_batch):
    seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(test_batch)
    logits = self(seq_ids_padded)
    loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
    pred_tags, pred_best_score = crf.crf_decode(potentials=logits,
                                                transition_params=self.trans_p,
                                                sequence_length=seq_len_list)
    pred_tags_masked = seq_masking(pred_tags, seq_len_list)
    return (loss, pred_tags_masked, tag_ids_padded)
def get_viterbi_decoding(self, potentials, sequence_length):
    # decode_tags: a [batch_size, max_seq_len] matrix with dtype `tf.int32`
    decode_tags, best_score = crf_decode(potentials, self.chain_kernel, sequence_length)
    # convert to one-hot encoding
    decode_tags = tf.keras.backend.one_hot(decode_tags, self.units)
    return decode_tags, best_score
def inner_train_one_step(self, batches, epochNum, task_name, log_writer, log_dir):
    '''
    :param self:
    :param batches: one batch data: [[sentence],[sentence],....]
                    sentence=[[chars],[charids],[tags],[tag_ids]]
    :param inner_epochNum:
    :return:
    '''
    # tf.summary.trace_on(graph=True, profiler=True)  # enable tracing (optional)
    batch_Nums = len(batches)
    losses, P_ts, R_ts, F1_ts = [], [], [], []
    # ===== run model =====
    with tqdm(total=batch_Nums) as bar:
        for batch_num in range(batch_Nums):
            batch = batches[batch_num]
            seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(batch)
            with tf.GradientTape() as tape:
                logits = self(seq_ids_padded)
                loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
                pred_tags, pred_best_score = crf.crf_decode(
                    potentials=logits,
                    transition_params=self.trans_p,
                    sequence_length=seq_len_list)
            grads = tape.gradient(loss, self.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

            pred_tags_masked = seq_masking(pred_tags, seq_len_list)
            p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked, taskname=task_name)
            t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded, taskname=task_name)
            (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=False)
            losses.append(loss)
            P_ts.append(P_t)
            R_ts.append(R_t)
            F1_ts.append(F1_t)
            print('train_loss:{}, train_P:{}'.format(loss, P_t))
            bar.update(1)
    with log_writer.as_default():
        tf.summary.scalar("loss", np.mean(losses), step=epochNum)
        tf.summary.scalar("P", np.mean(P_ts), step=epochNum)
        tf.summary.scalar("R", np.mean(R_ts), step=epochNum)
        tf.summary.scalar("F1", np.mean(F1_ts), step=epochNum)
def call(self, inputs):
    seq_target, tag_target, seq_true, tag_true = inputs
    loss = self.multi_loss([seq_true, tag_true], [seq_target, tag_target])
    self.add_loss(loss)
    seq_input_shape = tf.slice(tf.shape(seq_target), [0], [2])
    tag_input_shape = tf.slice(tf.shape(tag_target), [0], [2])
    seq_mask, tag_mask = tf.ones(seq_input_shape), tf.ones(tag_input_shape)
    seq_sequence_lengths = K.sum(K.cast(seq_mask, 'int32'), axis=-1)
    tag_sequence_lengths = K.sum(K.cast(tag_mask, 'int32'), axis=-1)
    seq_target, _ = crf_decode(seq_target, self.trans_seq, seq_sequence_lengths)
    tag_target, _ = crf_decode(tag_target, self.trans_tag, tag_sequence_lengths)
    return [seq_target, tag_target]
def call(self, inputs, lengths=None):
    """
    Parameters:
        inputs  [B, L, T]
        lengths [B]
    Returns:
        [B, L]
    """
    # inputs = inputs[:, 1:-1, :]
    shape = tf.shape(inputs)
    if lengths is None:
        lengths = tf.ones((shape[0], ), dtype=tf.int32) * shape[1]
    tags_id, _ = crf_decode(potentials=inputs,
                            transition_params=self.transition_params,
                            sequence_length=lengths)
    return tags_id
def call(self, inputs, sequence_lengths=None, **kwargs):
    sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
    if sequence_lengths is not None:
        assert len(sequence_lengths.shape) == 2
        assert tf.convert_to_tensor(sequence_lengths).dtype == "int32"
        seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
        assert seq_len_shape[1] == 1
        self.sequence_lengths = tf.keras.backend.flatten(sequence_lengths)
    else:
        self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * tf.shape(inputs)[1]
    viterbi_sequence, _ = crf_decode(sequences, self.transitions, self.sequence_lengths)
    output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim)
    # during training return the raw scores; at inference return the one-hot Viterbi decode
    return tf.keras.backend.in_train_phase(sequences, output)
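# ---------------------------------------------------------------------------
# A minimal sketch of the train/inference switch used in the layer above:
# in_train_phase returns its first argument while training and its second
# otherwise. The values below are illustrative only.
import tensorflow as tf

raw_scores = tf.constant([1.0, 2.0])
viterbi_one_hot = tf.constant([9.0, 9.0])

print(tf.keras.backend.in_train_phase(raw_scores, viterbi_one_hot, training=True).numpy())   # [1. 2.]
print(tf.keras.backend.in_train_phase(raw_scores, viterbi_one_hot, training=False).numpy())  # [9. 9.]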
def inner_train_one_step(self, batches, inner_iters, inner_epochNum, outer_epochNum,
                         task_name, log_writer):
    '''
    :param self:
    :param batches: one batch data: [[sentence],[sentence],....]
                    sentence=[[chars],[charids],[tags],[tag_ids]]
    :param inner_epochNum:
    :return:
    '''
    batches_len = len(batches)
    # ===== run model =====
    for batch_num in range(batches_len):
        batch = batches[batch_num]
        seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(batch)
        with tf.GradientTape() as tape:
            logits = self(seq_ids_padded)
            loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
            pred_tags, pred_best_score = crf.crf_decode(
                potentials=logits,
                transition_params=self.trans_p,
                sequence_length=seq_len_list)
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

        pred_tags_masked = seq_masking(pred_tags, seq_len_list)
        p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked, taskname=task_name)
        t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded, taskname=task_name)
        (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=False)
        with log_writer.as_default():
            step = inner_epochNum + outer_epochNum * inner_iters
            tf.summary.scalar("loss", loss, step=step)
            tf.summary.scalar("P", P_t, step=step)
            tf.summary.scalar("R", R_t, step=step)
            tf.summary.scalar("F", F1_t, step=step)
    return (loss, P_t)
def inner_train_one_step(self, batches, inner_epochNum, ckpt_manager, log_writer=None):
    '''
    :param self:
    :param batches: one batch data: [[sentence],[sentence],....]
                    sentence=[[chars],[charids],[tags],[tag_ids]]
    :param inner_epochNum:
    :return:
    '''
    num_batches = len(batches)
    print('======================== num_batches:', num_batches)
    # ===== run model =====
    with tqdm(total=num_batches) as bar:
        for batch_num in range(num_batches):
            batch = batches[batch_num]
            seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(batch)
            with tf.GradientTape() as tape:
                logits = self(seq_ids_padded)
                loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
                pred_tags, pred_best_score = crf.crf_decode(potentials=logits,
                                                            transition_params=self.trans_p,
                                                            sequence_length=seq_len_list)
            grads = tape.gradient(loss, self.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
            bar.update(1)

            pred_tags_masked = seq_masking(pred_tags, seq_len_list)
            p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked)
            t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded)
            try:
                (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=True)
            except Exception as e:
                print(e)
            with log_writer.as_default():
                step = batch_num + 1 + inner_epochNum * num_batches
                tf.summary.scalar("loss", loss, step=inner_epochNum)
                tf.summary.scalar("P", P_t, step=inner_epochNum)
                tf.summary.scalar("R", R_t, step=inner_epochNum)
                tf.summary.scalar("F", F1_t, step=inner_epochNum)
    ckpt_manager.save(checkpoint_number=inner_epochNum)
def validate_one_batches(self, test_batches, task_name, log_writer, epoch):
    seq_embeddings = test_batches['emb']
    tag_ids = test_batches['tag_ids']
    seq_len_list = test_batches['lens']
    seq_len_list_plus2 = [x + 2 for x in seq_len_list]
    tag_ids_padded = pad_tag_ids(tag_ids)

    logits = self(seq_embeddings)
    loss = self.crf_loss(logits, tag_ids_padded, seq_len_list_plus2)
    pred_tags, pred_best_score = crf.crf_decode(
        potentials=logits,
        transition_params=self.trans_p,
        sequence_length=seq_len_list_plus2)
    pred_tags_masked = seq_masking(pred_tags, seq_len_list_plus2)

    p_tags_char, _ = get_id2tag_V2(pred_tags_masked, seq_len_list_plus2, taskname=task_name)
    t_tags_char, _ = get_id2tag_V2(tag_ids_padded, seq_len_list_plus2, taskname=task_name)
    (P, R, F1), _ = evaluate(t_tags_char, p_tags_char, verbose=True)
    write_to_log(loss, P, R, F1, t_tags_char, log_writer, epoch)
    return (loss, pred_tags_masked, tag_ids_padded, P, R, F1)
def train(configs, data_manager, logger):
    vocab_size = data_manager.max_token_number
    num_classes = data_manager.max_label_number
    learning_rate = configs.learning_rate
    max_to_keep = configs.checkpoints_max_to_keep
    checkpoints_dir = configs.checkpoints_dir
    checkpoint_name = configs.checkpoint_name
    best_f1_val = 0.0
    best_at_epoch = 0
    unprocessed = 0
    very_start_time = time.time()
    epoch = configs.epoch
    batch_size = configs.batch_size

    # rough optimizer quality on this task: Adagrad > Adam > RMSprop > SGD
    if configs.optimizer == 'Adagrad':
        optimizer = tf.keras.optimizers.Adagrad(learning_rate=learning_rate)
    elif configs.optimizer == 'Adadelta':
        optimizer = tf.keras.optimizers.Adadelta(learning_rate=learning_rate)
    elif configs.optimizer == 'RMSprop':
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif configs.optimizer == 'SGD':
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    if configs.use_bert and not configs.finetune:
        bert_model = TFBertModel.from_pretrained('bert-base-chinese')
    else:
        bert_model = None

    train_dataset, val_dataset = data_manager.get_training_set()
    ner_model = NerModel(configs, vocab_size, num_classes)

    checkpoint = tf.train.Checkpoint(ner_model=ner_model)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, directory=checkpoints_dir,
        checkpoint_name=checkpoint_name, max_to_keep=max_to_keep)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print('Restored from {}'.format(checkpoint_manager.latest_checkpoint))
    else:
        print('Initializing from scratch.')

    num_val_iterations = int(math.ceil(1.0 * len(val_dataset) / batch_size))
    logger.info(('+' * 20) + 'training starting' + ('+' * 20))
    for i in range(epoch):
        start_time = time.time()
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for step, batch in tqdm(train_dataset.shuffle(len(train_dataset)).batch(batch_size).enumerate()):
            if configs.use_bert:
                X_train_batch, y_train_batch, att_mask_batch = batch
                if configs.finetune:
                    # fine-tuning: feed token ids and attention mask directly
                    model_inputs = (X_train_batch, att_mask_batch)
                else:
                    # no fine-tuning: BERT is only used for feature enhancement
                    model_inputs = bert_model(X_train_batch, attention_mask=att_mask_batch)[0]
            else:
                X_train_batch, y_train_batch = batch
                model_inputs = X_train_batch
            # sentence lengths before padding was added
            inputs_length = tf.math.count_nonzero(X_train_batch, 1)
            with tf.GradientTape() as tape:
                logits, log_likelihood, transition_params = ner_model(
                    inputs=model_inputs, inputs_length=inputs_length,
                    targets=y_train_batch, training=1)
                loss = -tf.reduce_mean(log_likelihood)
            # variables that take part in the gradient update;
            # drop the parameters of BERT's pooler layer
            variables = ner_model.trainable_variables
            variables = [var for var in variables if 'pooler' not in var.name]
            gradients = tape.gradient(loss, variables)
            # backpropagation via automatic differentiation
            optimizer.apply_gradients(zip(gradients, variables))
            if step % configs.print_per_batch == 0 and step != 0:
                batch_pred_sequence, _ = crf_decode(logits, transition_params, inputs_length)
                measures, _ = metrics(X_train_batch, y_train_batch, batch_pred_sequence,
                                      configs, data_manager, tokenizer)
                res_str = ''
                for k, v in measures.items():
                    res_str += (k + ': %.3f ' % v)
                logger.info('training batch: %5d, loss: %.5f, %s' % (step, loss, res_str))

        # validation
        logger.info('start evaluate engines...')
        loss_values = []
        val_results = {}
        val_labels_results = {}
        for label in data_manager.suffix:
            val_labels_results.setdefault(label, {})
        for measure in configs.measuring_metrics:
            val_results[measure] = 0
        for label, content in val_labels_results.items():
            for measure in configs.measuring_metrics:
                val_labels_results[label][measure] = 0

        for val_batch in tqdm(val_dataset.batch(batch_size)):
            if configs.use_bert:
                X_val_batch, y_val_batch, att_mask_batch = val_batch
                if configs.finetune:
                    model_inputs = (X_val_batch, att_mask_batch)
                else:
                    model_inputs = bert_model(X_val_batch, attention_mask=att_mask_batch)[0]
            else:
                X_val_batch, y_val_batch = val_batch
                model_inputs = X_val_batch
            inputs_length_val = tf.math.count_nonzero(X_val_batch, 1)
            logits_val, log_likelihood_val, transition_params_val = ner_model(
                inputs=model_inputs, inputs_length=inputs_length_val, targets=y_val_batch)
            val_loss = -tf.reduce_mean(log_likelihood_val)
            batch_pred_sequence_val, _ = crf_decode(logits_val, transition_params_val,
                                                    inputs_length_val)
            measures, lab_measures = metrics(X_val_batch, y_val_batch, batch_pred_sequence_val,
                                             configs, data_manager, tokenizer)
            for k, v in measures.items():
                val_results[k] += v
            for lab in lab_measures:
                for k, v in lab_measures[lab].items():
                    val_labels_results[lab][k] += v
            loss_values.append(val_loss)

        time_span = (time.time() - start_time) / 60
        val_res_str = ''
        val_f1_avg = 0
        for k, v in val_results.items():
            val_results[k] /= num_val_iterations
            val_res_str += (k + ': %.3f ' % val_results[k])
            if k == 'f1':
                val_f1_avg = val_results[k]
        for label, content in val_labels_results.items():
            val_label_str = ''
            for k, v in content.items():
                val_labels_results[label][k] /= num_val_iterations
                val_label_str += (k + ': %.3f ' % val_labels_results[label][k])
            logger.info('label: %s, %s' % (label, val_label_str))
        logger.info('time consumption:%.2f(min), %s' % (time_span, val_res_str))

        if np.array(val_f1_avg).mean() > best_f1_val:
            unprocessed = 0
            best_f1_val = np.array(val_f1_avg).mean()
            best_at_epoch = i + 1
            checkpoint_manager.save()
            logger.info('saved the new best model with f1: %.3f' % best_f1_val)
        else:
            unprocessed += 1

        if configs.is_early_stop:
            if unprocessed >= configs.patient:
                logger.info('early stopped, no progress obtained within {} epochs'.format(configs.patient))
                logger.info('overall best f1 is {} at {} epoch'.format(best_f1_val, best_at_epoch))
                logger.info('total training time consumption: %.3f(min)' % ((time.time() - very_start_time) / 60))
                return
    logger.info('overall best f1 is {} at {} epoch'.format(best_f1_val, best_at_epoch))
    logger.info('total training time consumption: %.3f(min)' % ((time.time() - very_start_time) / 60))
def train(configs, data_manager, logger):
    domain_classes = data_manager.domain_class_number
    intent_classes = data_manager.intent_class_number
    slot_classes = data_manager.slot_class_number
    id2slot = data_manager.id2slot
    learning_rate = configs.learning_rate
    epoch = configs.epoch
    batch_size = configs.batch_size
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    bert_model = TFBertModel.from_pretrained('bert-base-chinese')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    X_train, att_mask_train, domain_train, intent_train, slot_train, \
        X_val, att_mask_val, domain_val, intent_val, slot_val = data_manager.get_training_set()
    bilstm_crf_model = BiLSTM_CRFModel(configs, slot_classes)
    domain_model = DomainClassificationModel(configs, domain_classes)
    intent_model = IntentClassificationModel(configs, intent_classes)
    num_iterations = int(math.ceil(1.0 * len(X_train) / batch_size))
    num_val_iterations = int(math.ceil(1.0 * len(X_val) / batch_size))
    logger.info(('+' * 20) + 'training starting' + ('+' * 20))
    for i in range(epoch):
        start_time = time.time()
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for iteration in tqdm(range(num_iterations)):
            X_train_batch, att_mask_train_batch, domain_train_batch, intent_train_batch, slot_train_batch \
                = data_manager.next_batch(X_train, att_mask_train, domain_train, intent_train,
                                          slot_train, start_index=iteration * batch_size)
            inputs_length = tf.math.count_nonzero(X_train_batch, 1)
            # get the BERT model output
            bert_model_inputs = bert_model(X_train_batch, attention_mask=att_mask_train_batch)[0]
            with tf.GradientTape() as tape:
                # slot filling model input
                slot_logits, slot_log_likelihood, slot_transition_params = bilstm_crf_model.call(
                    inputs=bert_model_inputs, inputs_length=inputs_length,
                    targets=slot_train_batch, training=1)
                slot_loss = -tf.reduce_mean(slot_log_likelihood)
                # domain classification model input
                domain_logits = domain_model.call(inputs=bert_model_inputs[:, 0, :], training=1)
                domain_loss_vec = tf.keras.losses.sparse_categorical_crossentropy(
                    y_pred=domain_logits, y_true=domain_train_batch)
                domain_loss = tf.reduce_mean(domain_loss_vec)
                # intent classification model input
                intent_logits = intent_model.call(inputs=bert_model_inputs[:, 0, :], training=1)
                intent_loss_vec = tf.keras.losses.sparse_categorical_crossentropy(
                    y_pred=intent_logits, y_true=intent_train_batch)
                intent_loss = tf.reduce_mean(intent_loss_vec)
                total_loss = domain_loss + intent_loss + 2 * slot_loss
            # variables that take part in the gradient update
            trainable_variables = (bilstm_crf_model.trainable_variables
                                   + domain_model.trainable_variables
                                   + intent_model.trainable_variables)
            gradients = tape.gradient(total_loss, trainable_variables)
            # backpropagation via automatic differentiation
            optimizer.apply_gradients(zip(gradients, trainable_variables))
            if iteration % configs.print_per_batch == 0 and iteration != 0:
                domain_predictions = tf.argmax(domain_logits, axis=-1)
                intent_predictions = tf.argmax(intent_logits, axis=-1)
                domain_measures = cal_metrics(y_true=domain_train_batch, y_pred=domain_predictions)
                intent_measures = cal_metrics(y_true=intent_train_batch, y_pred=intent_predictions)
                batch_pred_sequence, _ = crf_decode(slot_logits, slot_transition_params, inputs_length)
                slot_measures = cal_slots_metrics(X_train_batch, slot_train_batch, batch_pred_sequence,
                                                  id2slot, tokenizer)
                domain_str = ''
                for k, v in domain_measures.items():
                    domain_str += (k + ': %.3f ' % v)
                logger.info('training batch: {}'.format(iteration))
                logger.info('domain_loss: %.5f, %s' % (domain_loss, domain_str))
                intent_str = ''
                for k, v in intent_measures.items():
                    intent_str += (k + ': %.3f ' % v)
                logger.info('intent_loss: %.5f, %s' % (intent_loss, intent_str))
                slot_str = ''
                for k, v in slot_measures.items():
                    slot_str += (k + ': %.3f ' % v)
                logger.info('slot_loss: %.5f, %s' % (slot_loss, slot_str))

        # validation
        logger.info('start evaluate engines...')
        slot_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        domain_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        intent_val_results = {'precision': 0, 'recall': 0, 'f1': 0}
        for iteration in tqdm(range(num_val_iterations)):
            X_val_batch, att_mask_val_batch, domain_val_batch, intent_val_batch, slot_val_batch \
                = data_manager.next_batch(X_val, att_mask_val, domain_val, intent_val, slot_val,
                                          start_index=iteration * batch_size)
            inputs_length = tf.math.count_nonzero(X_val_batch, 1)
            # get the BERT model output
            bert_model_inputs = bert_model(X_val_batch, attention_mask=att_mask_val_batch)[0]
            # slot model prediction
            slot_logits, slot_log_likelihood, slot_transition_params = bilstm_crf_model.call(
                inputs=bert_model_inputs, inputs_length=inputs_length, targets=slot_val_batch)
            batch_pred_sequence, _ = crf_decode(slot_logits, slot_transition_params, inputs_length)
            slot_measures = cal_slots_metrics(X_val_batch, slot_val_batch, batch_pred_sequence,
                                              id2slot, tokenizer)
            # domain model prediction
            domain_logits = domain_model.call(inputs=bert_model_inputs[:, 0, :])
            domain_predictions = tf.argmax(domain_logits, axis=-1)
            domain_measures = cal_metrics(y_true=domain_val_batch, y_pred=domain_predictions)
            # intent model prediction
            intent_logits = intent_model.call(inputs=bert_model_inputs[:, 0, :])
            intent_predictions = tf.argmax(intent_logits, axis=-1)
            intent_measures = cal_metrics(y_true=intent_val_batch, y_pred=intent_predictions)
            for k, v in slot_measures.items():
                slot_val_results[k] += v
            for k, v in domain_measures.items():
                domain_val_results[k] += v
            for k, v in intent_measures.items():
                intent_val_results[k] += v

        time_span = (time.time() - start_time) / 60
        val_slot_str = ''
        val_domain_str = ''
        val_intent_str = ''
        for k, v in slot_val_results.items():
            slot_val_results[k] /= num_val_iterations
            val_slot_str += (k + ': %.3f ' % slot_val_results[k])
        for k, v in domain_val_results.items():
            domain_val_results[k] /= num_val_iterations
            val_domain_str += (k + ': %.3f ' % domain_val_results[k])
        for k, v in intent_val_results.items():
            intent_val_results[k] /= num_val_iterations
            val_intent_str += (k + ': %.3f ' % intent_val_results[k])
        logger.info('slot: {}'.format(val_slot_str))
        logger.info('domain: {}'.format(val_domain_str))
        logger.info('intent: {}'.format(val_intent_str))
        logger.info('time consumption:%.2f(min)' % time_span)
def inner_train_one_step(self, batches, inner_iters, inner_epochNum, outer_epochNum,
                         task_name, log_writer, mod='pretrain'):
    '''
    :param self:
    :param batches: one batch data: [[sentence],[sentence],....]
                    sentence=[emb:[],chars:[],tags:[],tag_ids:[]]
    :param inner_epochNum:
    :return:
    '''
    batches_len = len(batches)
    # ===== run model =====
    for batch_num in range(batches_len):
        batch = batches[batch_num]
        seq_embeddings = batch['emb']
        tag_ids = batch['tag_ids']
        seq_len_list = batch['lens']
        seq_len_list_plus2 = [x + 2 for x in seq_len_list]
        tag_ids_padded = pad_tag_ids(tag_ids)
        with tf.GradientTape(persistent=True) as tape:
            logits = self(seq_embeddings)
            loss = self.crf_loss(logits, tag_ids_padded, seq_len_list_plus2)
            pred_tags, pred_best_score = crf.crf_decode(
                potentials=logits,
                transition_params=self.trans_p,
                sequence_length=seq_len_list_plus2)
        # update the CRF transition matrix
        grads = tape.gradient(loss, [self.trans_p])
        self.optimizer.apply_gradients(zip(grads, [self.trans_p]))
        # update the dense projection layer
        grads = tape.gradient(loss, self.dense.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.dense.trainable_variables))
        if mod == 'pretrain':
            # only update the BiLSTM encoder during pretraining
            grads = tape.gradient(loss, self.BiLSTM.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.BiLSTM.trainable_variables))
        del tape

        pred_tags_masked = seq_masking(pred_tags, seq_len_list_plus2)
        p_tags_char, p_tagsid_flatten = get_id2tag_V2(pred_tags_masked, seq_len_list_plus2,
                                                      taskname=task_name)
        t_tags_char, t_tagsid_flatten = get_id2tag_V2(tag_ids_padded, seq_len_list_plus2,
                                                      taskname=task_name)
        (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=False)
        with log_writer.as_default():
            step = inner_epochNum + outer_epochNum * inner_iters
            tf.summary.scalar("loss", loss, step=step)
            tf.summary.scalar("P", P_t, step=step)
            tf.summary.scalar("R", R_t, step=step)
            tf.summary.scalar("F", F1_t, step=step)
    return (loss, P_t, R_t, F1_t)
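# ---------------------------------------------------------------------------
# A minimal sketch of the persistent-tape pattern used above: persistent=True
# allows several tape.gradient() calls on the same tape (one per parameter
# group), and the tape is deleted afterwards to release its resources. The
# variables and loss below are illustrative only.
import tensorflow as tf

w1 = tf.Variable(2.0)
w2 = tf.Variable(3.0)
opt = tf.keras.optimizers.SGD(learning_rate=0.1)

with tf.GradientTape(persistent=True) as tape:
    loss = w1 * w1 + w2 * w2

grads_1 = tape.gradient(loss, [w1])
opt.apply_gradients(zip(grads_1, [w1]))
grads_2 = tape.gradient(loss, [w2])
opt.apply_gradients(zip(grads_2, [w2]))
del tape  # free the persistent tape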
def train(configs, data_manager, logger):
    vocab_size = data_manager.max_token_number
    num_classes = data_manager.max_label_number
    learning_rate = configs.learning_rate
    max_to_keep = configs.checkpoints_max_to_keep
    checkpoints_dir = configs.checkpoints_dir
    checkpoint_name = configs.checkpoint_name
    best_f1_val = 0.0
    best_at_epoch = 0
    unprocessed = 0
    very_start_time = time.time()
    epoch = configs.epoch
    batch_size = configs.batch_size

    # rough optimizer quality on this task: Adagrad > Adam > RMSprop > SGD
    if configs.optimizer == 'Adagrad':
        optimizer = tf.keras.optimizers.Adagrad(learning_rate=learning_rate)
    elif configs.optimizer == 'Adadelta':
        optimizer = tf.keras.optimizers.Adadelta(learning_rate=learning_rate)
    elif configs.optimizer == 'RMSprop':
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif configs.optimizer == 'SGD':
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    if configs.use_bert:
        bert_model = TFBertModel.from_pretrained('bert-base-chinese')
        tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        X_train, y_train, att_mask_train, X_val, y_val, att_mask_val = data_manager.get_training_set()
    else:
        X_train, y_train, X_val, y_val = data_manager.get_training_set()
        att_mask_train, att_mask_val = np.array([]), np.array([])
        bert_model, tokenizer = None, None

    bilstm_crf_model = BiLSTM_CRFModel(configs, vocab_size, num_classes, configs.use_bert)
    checkpoint = tf.train.Checkpoint(model=bilstm_crf_model)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, directory=checkpoints_dir,
        checkpoint_name=checkpoint_name, max_to_keep=max_to_keep)

    num_iterations = int(math.ceil(1.0 * len(X_train) / batch_size))
    num_val_iterations = int(math.ceil(1.0 * len(X_val) / batch_size))
    logger.info(('+' * 20) + 'training starting' + ('+' * 20))
    for i in range(epoch):
        start_time = time.time()
        # shuffle the training set at each epoch
        sh_index = np.arange(len(X_train))
        np.random.shuffle(sh_index)
        X_train = X_train[sh_index]
        y_train = y_train[sh_index]
        if configs.use_bert:
            att_mask_train = att_mask_train[sh_index]
        logger.info('epoch:{}/{}'.format(i + 1, epoch))
        for iteration in tqdm(range(num_iterations)):
            if configs.use_bert:
                X_train_batch, y_train_batch, att_mask_batch = data_manager.next_batch(
                    X_train, y_train, att_mask_train, start_index=iteration * batch_size)
                # sentence lengths before padding was added
                inputs_length = tf.math.count_nonzero(X_train_batch, 1)
                # get the BERT model output
                model_inputs = bert_model(X_train_batch, attention_mask=att_mask_batch)[0]
            else:
                X_train_batch, y_train_batch = data_manager.next_batch(
                    X_train, y_train, start_index=iteration * batch_size)
                # sentence lengths before padding was added
                inputs_length = tf.math.count_nonzero(X_train_batch, 1)
                model_inputs = X_train_batch
            with tf.GradientTape() as tape:
                logits, log_likelihood, transition_params = bilstm_crf_model.call(
                    inputs=model_inputs, inputs_length=inputs_length,
                    targets=y_train_batch, training=1)
                loss = -tf.reduce_mean(log_likelihood)
            # variables that take part in the gradient update
            gradients = tape.gradient(loss, bilstm_crf_model.trainable_variables)
            # backpropagation via automatic differentiation
            optimizer.apply_gradients(zip(gradients, bilstm_crf_model.trainable_variables))
            if iteration % configs.print_per_batch == 0 and iteration != 0:
                batch_pred_sequence, _ = crf_decode(logits, transition_params, inputs_length)
                measures, _ = metrics(X_train_batch, y_train_batch, batch_pred_sequence,
                                      configs, data_manager, tokenizer)
                res_str = ''
                for k, v in measures.items():
                    res_str += (k + ': %.3f ' % v)
                logger.info('training batch: %5d, loss: %.5f, %s' % (iteration, loss, res_str))

        # validation
        logger.info('start evaluate engines...')
        loss_values = []
        val_results = {}
        val_labels_results = {}
        for label in data_manager.suffix:
            val_labels_results.setdefault(label, {})
        for measure in configs.measuring_metrics:
            val_results[measure] = 0
        for label, content in val_labels_results.items():
            for measure in configs.measuring_metrics:
                val_labels_results[label][measure] = 0

        for iteration in tqdm(range(num_val_iterations)):
            if configs.use_bert:
                X_val_batch, y_val_batch, att_mask_batch = data_manager.next_batch(
                    X_val, y_val, att_mask_val, iteration * batch_size)
                inputs_length_val = tf.math.count_nonzero(X_val_batch, 1)
                # get the BERT model output
                model_inputs = bert_model(X_val_batch, attention_mask=att_mask_batch)[0]
            else:
                X_val_batch, y_val_batch = data_manager.next_batch(
                    X_val, y_val, iteration * batch_size)
                inputs_length_val = tf.math.count_nonzero(X_val_batch, 1)
                model_inputs = X_val_batch
            logits_val, log_likelihood_val, transition_params_val = bilstm_crf_model.call(
                inputs=model_inputs, inputs_length=inputs_length_val, targets=y_val_batch)
            val_loss = -tf.reduce_mean(log_likelihood_val)
            batch_pred_sequence_val, _ = crf_decode(logits_val, transition_params_val,
                                                    inputs_length_val)
            measures, lab_measures = metrics(X_val_batch, y_val_batch, batch_pred_sequence_val,
                                             configs, data_manager, tokenizer)
            for k, v in measures.items():
                val_results[k] += v
            for lab in lab_measures:
                for k, v in lab_measures[lab].items():
                    val_labels_results[lab][k] += v
            loss_values.append(val_loss)

        time_span = (time.time() - start_time) / 60
        val_res_str = ''
        dev_f1_avg = 0
        for k, v in val_results.items():
            val_results[k] /= num_val_iterations
            val_res_str += (k + ': %.3f ' % val_results[k])
            if k == 'f1':
                dev_f1_avg = val_results[k]
        for label, content in val_labels_results.items():
            val_label_str = ''
            for k, v in content.items():
                val_labels_results[label][k] /= num_val_iterations
                val_label_str += (k + ': %.3f ' % val_labels_results[label][k])
            logger.info('label: %s, %s' % (label, val_label_str))
        logger.info('time consumption:%.2f(min), %s' % (time_span, val_res_str))

        if np.array(dev_f1_avg).mean() > best_f1_val:
            unprocessed = 0
            best_f1_val = np.array(dev_f1_avg).mean()
            best_at_epoch = i + 1
            checkpoint_manager.save()
            logger.info('saved the new best model with f1: %.3f' % best_f1_val)
        else:
            unprocessed += 1

        if configs.is_early_stop:
            if unprocessed >= configs.patient:
                logger.info('early stopped, no progress obtained within {} epochs'.format(configs.patient))
                logger.info('overall best f1 is {} at {} epoch'.format(best_f1_val, best_at_epoch))
                logger.info('total training time consumption: %.3f(min)' % ((time.time() - very_start_time) / 60))
                return
    logger.info('overall best f1 is {} at {} epoch'.format(best_f1_val, best_at_epoch))
    logger.info('total training time consumption: %.3f(min)' % ((time.time() - very_start_time) / 60))
def inner_train_one_step(self, batch, inner_iters, inner_epochNum, outer_epochNum,
                         task_name, log_writer):
    '''
    :param self:
    :param batch: batches = [S,Q]  S,Q: [[sentence],[sentence],....]
                  sentence=[emb:[],chars:[],tags:[],tag_ids:[]]
    :param inner_epochNum:
    :return:
    '''
    # ===== run model =====
    S, Q = batch
    Q_tag_ids = Q['tag_ids']
    S_tag_ids = S['tag_ids']
    Q_seq_len_list = Q['lens']
    Q_seq_len_list_plus2 = [x + 2 for x in Q_seq_len_list]
    Q_tag_ids_padded = pad_tag_ids(Q_tag_ids)
    S_tag_ids_padded = pad_tag_ids(S_tag_ids)
    Q['tag_ids'] = Q_tag_ids_padded
    S['tag_ids'] = S_tag_ids_padded
    with tf.GradientTape() as tape:
        logits = self([S, Q])
        loss = self.crf_loss(logits, Q_tag_ids_padded, Q_seq_len_list_plus2)
        pred_tags, pred_best_score = crf.crf_decode(
            potentials=logits,
            transition_params=self.trans_p,
            sequence_length=Q_seq_len_list_plus2)
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    pred_tags_masked = seq_masking(pred_tags, Q_seq_len_list_plus2)
    p_tags_char, p_tagsid_flatten = get_id2tag_V2(pred_tags_masked, Q_seq_len_list_plus2,
                                                  taskname=task_name)
    t_tags_char, t_tagsid_flatten = get_id2tag_V2(Q_tag_ids_padded, Q_seq_len_list_plus2,
                                                  taskname=task_name)
    (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=False)
    with log_writer.as_default():
        step = inner_epochNum + outer_epochNum * inner_iters
        tf.summary.scalar("loss", loss, step=step)
        tf.summary.scalar("P", P_t, step=step)
        tf.summary.scalar("R", R_t, step=step)
        tf.summary.scalar("F", F1_t, step=step)
    return (loss, P_t, R_t, F1_t)
def train_one_epoch(mymodel, optimizer, batches, epoch_num=1,
                    checkpoints_dir='checkpoints00', ckpt_manager=None, log_writer=None):
    '''
    :param mymodel:
    :param batches: one batch data: [[sentence],[sentence],....]
                    sentence=[[chars],[charids],[tags],[tag_ids]]
    :param epoch_num:
    :return:
    '''
    if ckpt_manager is None:
        checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=mymodel)
        ckpt_manager = tf.train.CheckpointManager(checkpoint, directory=checkpoints_dir,
                                                  max_to_keep=10)
    num_batches = len(batches)
    print('======================== num_batches:', num_batches)
    # ===== run model =====
    with tqdm(total=num_batches) as bar:
        for batch_num in range(num_batches):
            batch = batches[batch_num]
            seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(batch)
            with tf.GradientTape() as tape:
                logits = mymodel(seq_ids_padded)
                loss = mymodel.crf_loss(logits, tag_ids_padded, seq_len_list)
                pred_tags, pred_best_score = crf.crf_decode(potentials=logits,
                                                            transition_params=mymodel.trans_p,
                                                            sequence_length=seq_len_list)
            grads = tape.gradient(loss, mymodel.trainable_variables)
            optimizer.apply_gradients(zip(grads, mymodel.trainable_variables))
            bar.update(1)

            if batch_num % 2 == 0:
                pred_tags_masked = seq_masking(pred_tags, seq_len_list)
                p_tags = findall_tag(pred_tags_masked, seq_len_list)
                t_tags = findall_tag(tag_ids_padded, seq_len_list)
                (P_train, R_train, F1_train) = P_R_F1_score(p_tags, t_tags)
                p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked)
                t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded)
                try:
                    P_C, R_C, F1_C = evaluate(t_tags_char, p_tags_char, verbose=True)
                except Exception as e:
                    print(e)
                step = batch_num + 1 + epoch_num * num_batches
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("P_train", P_train, step=step)
                tf.summary.scalar("R_train", R_train, step=step)
                tf.summary.scalar("F1_train", F1_train, step=step)
                tf.summary.scalar("P_C", P_C, step=step)
                tf.summary.scalar("R_C", R_C, step=step)
                tf.summary.scalar("F1_C", F1_C, step=step)
                print('epoch:{}\t\tbatch:{}\t\ttrain_loss:{:.2f}\t\ttrain_P:{:.8f}\t\t'
                      'train_R:{:.8f}\t\ttrain_F1:{:.8f}'.format(
                          epoch_num, batch_num, loss, P_train, R_train, F1_train))
    ckpt_manager.save(checkpoint_number=epoch_num)