# from bert.extract_feature import BertVector
from albert_zh.extract_feature import BertVector
from dataset_pro import read_dictionary, random_embedding, label_id, read_data, data_generate
from dataset import data_trans

MAX_SEQ_LEN = 200  # NOTE: the longest sentence in the training set is 1080 chars

# Load label vocabulary, raw train/dev/test corpora and the word-to-id dictionary.
label2id = label_id()
train_data = read_data("dataset_pro/train.csv")
dev_data = read_data("dataset_pro/dev.csv")
test_data = read_data("dataset_pro/test.csv")
word2id = read_dictionary("dataset_pro/train.pkl")

# data_trans returns three values; only the (X, y) pairs are kept here.
_, origin_train_X, origin_train_y = data_trans('dataset/train.txt')
_, origin_dev_X, origin_dev_y = data_trans('dataset/dev.txt')
_, origin_test_X, origin_test_y = data_trans('dataset/test.txt')

# Re-join each character list into a sentence string, keeping tags aligned
# (comprehensions replace the original append loops; same resulting lists).
train_sent = [''.join(sent_) for sent_, tag_ in train_data]
train_tag = [tag_ for sent_, tag_ in train_data]

dev_sent = [''.join(sent_) for sent_, tag_ in dev_data]
dev_tag = [tag_ for sent_, tag_ in dev_data]
def load_data(filename):
    """Build SPO samples from one raw annotation file.

    Runs the file through ``data_trans`` and feeds the resulting token
    and text lists to ``spo_generate``; the tag list is not needed here.
    """
    tokens, texts, _tags = data_trans(filename)
    return spo_generate(tokens, texts)
from model import w2v
from model import label_id_dict

# Restore the trained BiLSTM-CRF model. The CRF layer and its loss/metric
# are custom objects and must be supplied for Keras deserialization.
model = load_model('lstm_crf_ner_0610_3.h5',
                   custom_objects={"CRF": CRF,
                                   'crf_loss': crf_loss,
                                   'crf_viterbi_accuracy': crf_viterbi_accuracy})

_, _, test_x, _ = w2v()
id_label_dict = {v: k for k, v in label_id_dict.items()}

# Predict a label id per token; id 0 is dropped (assumed to be the padding
# label, as in the original `if _` filter — TODO confirm against label_id_dict).
y = np.argmax(model.predict(test_x), axis=2)
pred_tags = [[id_label_dict[idx] for idx in row if idx] for row in y]

# Predicted sequences may not match the gold lengths (padding/truncation),
# so truncate or pad with 'O' before scoring.
# BUG FIX: data_trans returns three values at every other call site, but the
# original unpacked only two here, so test_tags did not hold the gold tags.
_, test_sents, test_tags = data_trans('dataset/test.txt')
final_tags = []
for test_tag, pred_tag in zip(test_tags, pred_tags):
    if len(test_tag) == len(pred_tag):
        final_tags.append(pred_tag)
    elif len(test_tag) < len(pred_tag):
        final_tags.append(pred_tag[:len(test_tag)])
    else:
        final_tags.append(pred_tag + ['O'] * (len(test_tag) - len(pred_tag)))

# Entity-level evaluation on the test set via seqeval.
print(classification_report(test_tags, final_tags, digits=4))