import os
import json

import numpy as np
from keras.models import load_model

from att import Attention
from bert.extract_feature import BertVector


def predict(inputTextList):
    print("begin")
    # Load the trained model with the best score (encoded in the file name)
    model_dir = './models'
    files = os.listdir(model_dir)
    models_path = [os.path.join(model_dir, _) for _ in files]
    best_model_path = sorted(
        models_path,
        key=lambda x: float(x.split('-')[-1].replace('.h5', '')),
        reverse=True)[0]
    print("the best model is", best_model_path)
    model = load_model(best_model_path, custom_objects={"Attention": Attention})

    # Use BERT to extract sentence features
    bert_model = BertVector(pooling_strategy="NONE", max_seq_len=80)
    print("the bert model for sentence vector is ready")

    # Load the relation dictionary once, instead of once per input
    with open('data/rel_dict.json', 'r', encoding='utf-8') as f:
        rel_dict = json.load(f)
    id_rel_dict = {v: k for k, v in rel_dict.items()}

    return_List = []
    for inputText in inputTextList:
        per1, per2, doc = '', '', ''
        try:
            per1, per2, doc = inputText.split('#')
            # Mask both person names in the document with '#' placeholders
            text = '$'.join([
                per1, per2,
                doc.replace(per1, len(per1) * '#').replace(per2, len(per2) * '#')
            ])
            vec = bert_model.encode([text])["encodes"][0]
            x_train = np.array([vec])
            # Predict and take the most probable relation id
            predicted = model.predict(x_train)
            y = np.argmax(predicted[0])
            return_List.append([per1, per2, id_rel_dict[y], doc])
        except Exception as e:
            print("error:", e)
            return_List.append([per1, per2, "", doc])
            continue
    return return_List
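
# A minimal usage sketch for predict() above; the sample string is made up
# for illustration and assumes the 'person1#person2#document' input format,
# plus an existing ./models directory and data/rel_dict.json.
if __name__ == '__main__':
    samples = ['张三#李四#张三和李四是多年的同事,也是无话不谈的好朋友。']
    for per1, per2, rel, doc in predict(samples):
        print(per1, per2, '->', rel)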
import numpy as np
from keras.layers import Input, Dense
from keras.layers import GRU, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical
import matplotlib.pyplot as plt

from att import Attention
from load_data import get_train_test_pd
from bert.extract_feature import BertVector

# Read the data files and encode every text as a BERT vector
train_df, test_df = get_train_test_pd()
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=80)
print('begin encoding')
f = lambda text: bert_model.encode([text])["encodes"][0]
train_df['x'] = train_df['text'].apply(f)
test_df['x'] = test_df['text'].apply(f)
print('end encoding')

# Build the training and test sets
x_train = np.array([vec for vec in train_df['x']])
x_test = np.array([vec for vec in test_df['x']])
y_train = np.array([vec for vec in train_df['label']])
y_test = np.array([vec for vec in test_df['label']])
# print('x_train: ', x_train.shape)

# Convert the labels to one-hot vectors
num_classes = 14
y_train = to_categorical(y_train, num_classes)
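
# A minimal sketch of how these features might be consumed (the fragment
# above stops after the label conversion). The architecture below is an
# assumption suggested by the imports (Bidirectional GRU + custom Attention),
# not the author's exact model; in particular, the constructor argument of
# the custom Attention layer is a guess.
from keras.models import Model

y_test = to_categorical(y_test, num_classes)

inputs = Input(shape=(80, 768))  # token-level BERT vectors
gru = Bidirectional(GRU(128, return_sequences=True))(inputs)
att = Attention(32)(gru)  # assumed signature of the custom layer
outputs = Dense(num_classes, activation='softmax')(att)
model = Model(inputs, outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=8, epochs=30,
          validation_data=(x_test, y_test),
          callbacks=[EarlyStopping(monitor='val_loss', patience=5)])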
import numpy as np
from keras.models import load_model

from bert.extract_feature import BertVector
from InputPassage_OutputSentence import outfile

model = load_model("model/question_sentence_classify_20000.h5")

# Read the sentences to predict
texts = []
with open(outfile, encoding="UTF-8") as f:
    for line in f.readlines():
        texts.append(line.strip())

labels = []
true_probabilities = []
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=70)

# Predict each sentence
for text in texts:
    # Convert the sentence into a BERT vector
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])
    # Model prediction
    predicted = model.predict(x_train)
    y = np.argmax(predicted[0])
    print("y:", y)
    label = '1' if y else '0'
    # Probability that the sentence belongs to class 1
    true_probability = predicted[0][1]
    true_probabilities.append(true_probability)
    labels.append(label)
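
# A small follow-up sketch (not in the original script): write each predicted
# label and its class-1 probability next to the sentence. The output path is
# an assumption.
with open("predicted_labels.txt", "w", encoding="utf-8") as fw:
    for text, label, prob in zip(texts, labels, true_probabilities):
        fw.write("%s\t%.4f\t%s\n" % (label, prob, text))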
import codecs
import random

import numpy as np
from keras.layers import Input, Masking, GRU, Dense, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from bert.extract_feature import BertVector


class BertClassification(object):
    def __init__(self, nb_classes=2, gru_dim=128, dense_dim=128, max_len=100,
                 batch_size=128, epochs=10,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 save_weights_file="./model/weights_lstm.h5"):
        self.nb_classes = nb_classes
        self.gru_dim = gru_dim
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        self.save_weights_file = save_weights_file
        self.nb_samples = 25000  # number of training samples
        self.bert_model = BertVector(pooling_strategy="NONE",
                                     max_seq_len=self.max_len,
                                     bert_model_path="./chinese_L-12_H-768_A-12/",
                                     graph_tmpfile="./tmp_graph_xxx")

    def text2bert(self, text):
        """Convert a text into its BERT vector."""
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """Turn tab-separated 'label<TAB>content' lines into model inputs."""
        X, y = [], []
        for line in lines:
            line = line.strip().split("\t")
            label = int(line[0])
            content = line[1]
            vec = self.text2bert(content)
            X.append(vec)
            y.append(label)
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Data generator over the training corpus."""
        fr = codecs.open(self.train_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index: index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """Load the validation data."""
        fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        # Token-level BERT vectors (max_len x 768) -> GRU -> dense -> softmax
        x_in = Input(shape=(self.max_len, 768, ))
        x_out = Masking(mask_value=0.0)(x_in)
        x_out = GRU(self.gru_dim, dropout=0.25, recurrent_dropout=0.25)(x_out)
        x_out = Dense(self.dense_dim, activation="relu")(x_out)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=0.001),
                      metrics=['accuracy'])
        # Keep only the weights with the best validation accuracy
        checkpoint = ModelCheckpoint(self.save_weights_file, monitor='val_acc',
                                     verbose=1, save_best_only=True, mode='max')
        x_test, y_test = self.data_val()
        steps_per_epoch = int(self.nb_samples / self.batch_size) + 1
        model.fit_generator(self.data_iter(),
                            steps_per_epoch=steps_per_epoch,
                            epochs=self.epochs,
                            verbose=1,
                            validation_data=(x_test, y_test),
                            validation_steps=None,
                            callbacks=[checkpoint])
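
# A minimal driver sketch (an assumption; the original snippet does not show one).
if __name__ == "__main__":
    clf = BertClassification()
    clf.train()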
import numpy as np
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import np_utils

from bert.extract_feature import BertVector


class BertClassification(object):
    def __init__(self, nb_classes=143, dense_dim=256, max_len=128, batch_size=32,
                 epochs=30,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 weights_file_path="./model/bertweights_fc.h5"):
        self.nb_classes = nb_classes
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        self.nb_samples = 46985  # number of training samples
        self.bert_model = BertVector(
            pooling_strategy="REDUCE_MEAN",
            max_seq_len=self.max_len,
            bert_model_path=
            r"D:\赵鲸朋\pycharmModel0905\pycharmModel0905\keras_bert_classification\uncased_L-12_H-768_A-12",
            graph_tmpfile="./data/output/tmp_graph_xxx")

    def text2bert(self, text):
        """Convert a text into its BERT vector."""
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """Turn tab-separated 'label<TAB>content' lines into model inputs."""
        X, y = [], []
        for line in lines:
            line = line.strip().split("\t")
            label = wosy2_to_id[line[0]]  # map the label string to its id
            content = line[1]
            vec = self.text2bert(content)
            X.append(vec)
            y.append(label)
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Data generator over the training samples
        (train_labcont is prepared elsewhere in the script)."""
        lines = train_labcont
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index:index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """Validation data (test_labcont is prepared elsewhere in the script)."""
        lines = test_labcont
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        # Sentence-level BERT vector (768,) -> dense -> softmax
        x_in = Input(shape=(768, ))
        # tanh
        x_out = Dense(self.dense_dim, activation="relu")(x_in)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        model.summary()
        checkpoint = ModelCheckpoint(self.weights_file_path, monitor='val_acc',
                                     verbose=1, save_best_only=True, mode='max')
        early_stopping = EarlyStopping(monitor='val_acc', patience=2, mode='max')
        x_test, y_test = self.data_val()
        model.fit_generator(
            self.data_iter(),
            steps_per_epoch=int(self.nb_samples / self.batch_size) + 1,
            epochs=self.epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            validation_steps=None,
            callbacks=[checkpoint, early_stopping])
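
# A minimal inference sketch (an assumption, not part of the original):
# rebuild the network, load the checkpointed weights, and classify one text.
if __name__ == "__main__":
    clf = BertClassification()
    model = clf.create_model()
    model.load_weights(clf.weights_file_path)
    vec = clf.text2bert("some document text")  # hypothetical input
    pred = model.predict(np.array([vec]))
    print("predicted class id:", np.argmax(pred[0]))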
# coding:utf-8
from bert.extract_feature import BertVector

pooling_strategy = "REDUCE_MEAN"
# pooling_strategy = "NONE"
bc = BertVector(pooling_strategy=pooling_strategy, max_seq_len=80)

# The same sentence, with and without whitespace between tokens
s1 = '人 同 去 福田 图书馆 啊 在 家 写 作业 巨 没 feel , 我 的 作业'
s2 = "人同去福田图书馆啊在家写作业巨没feel,我的作业"

v = bc.encode([s1])
v1 = v["encodes"][0]
print(v1)

v = bc.encode([s2])
v2 = v["encodes"][0]
print(v2)
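
# A small follow-up sketch (not in the original script): cosine similarity
# between the two vectors, to quantify how much the whitespace tokenization
# changes the sentence embedding.
import numpy as np

cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print("cosine similarity:", cos_sim)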
import codecs
import random

import numpy as np
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from sklearn.metrics import precision_score, recall_score, f1_score

from bert.extract_feature import BertVector


class BertClassification(object):
    def __init__(self, nb_classes=3, dense_dim=256, max_len=100, batch_size=128,
                 epochs=50,
                 train_corpus_path="data/train.csv",
                 test_corpus_path="data/dev.csv",
                 weights_file_path="./model/weights_fc.h5"):
        self.nb_classes = nb_classes
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        self.nb_samples = 17  # number of samples
        # local BERT model: D:\NLP项目\bert模型\chinese_L-12_H-768_A-12
        self.bert_model = BertVector(pooling_strategy="REDUCE_MEAN",
                                     max_seq_len=self.max_len,
                                     bert_model_path="chinese_L-12_H-768_A-12",
                                     graph_tmpfile="./tmp_graph_xxx")

    def text2bert(self, text):
        """Convert a text into its BERT vector."""
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """Turn raw CSV lines into model inputs: the label sits in
        column 4 and the text in column 2."""
        X, y = [], []
        for line in lines:
            line = line.strip().split(",")
            try:
                label = int(line[4])
                content = line[2]
                vec = self.text2bert(content)
                X.append(vec)
                y.append(label)
            except Exception:
                # Skip lines that fail to parse
                print(line[0])
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Data generator over the training corpus."""
        fr = codecs.open(self.train_corpus_path, "r", "utf-8")  # training set
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index:index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """Validation data."""
        fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        x_in = Input(shape=(768, ))
        x_out = Dense(self.dense_dim, activation="relu")(x_in)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)  # classification layer
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        checkpoint = ModelCheckpoint(self.weights_file_path, monitor='val_acc',
                                     verbose=1, save_best_only=True, mode='max')
        x_test, y_test = self.data_val()
        model.fit_generator(
            self.data_iter(),  # training-data iterator
            steps_per_epoch=int(self.nb_samples / self.batch_size) + 1,
            epochs=self.epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            validation_steps=None,
            callbacks=[checkpoint])

        # Evaluate on the validation set with macro-averaged metrics
        pred = model.predict(x_test)
        pred = [np.argmax(val) for val in pred]
        print(pred)  # e.g. [1, 0, 1, 0, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 2, 2, 1]
        y_true = []
        for val in y_test:
            y_true.append(np.argmax(val))
        print(y_true)
        p = precision_score(y_true, pred, average='macro')
        r = recall_score(y_true, pred, average='macro')
        f1 = f1_score(y_true, pred, average='macro')
        print(p)
        print(r)
        print(f1)
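
# A possible extension (a sketch, not in the original): per-class metrics
# in addition to the macro scores printed in train(). Call it with the
# y_true and pred lists computed there.
from sklearn.metrics import classification_report

def report(y_true, pred):
    """Print precision/recall/F1 for every class."""
    print(classification_report(y_true, pred))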
## Shuffle the original data and split it into training and validation sets
## (this experiment draws 10,200 samples: the first 8,000 for training and
## the remaining 2,200 for validation)
idxs = np.random.randint(0, len(texts), size=10200)  # sampled with replacement
X = []
y = []
for id in idxs:
    X.append(texts[id])
    y.append(labels[id])

## Vectorize the texts with BERT
print("start encoding...")
X_VEC = []
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=100)
for text in X:
    X_VEC.append(bert_model.encode([text])["encodes"][0])

# With pooling_strategy="NONE", take the first position ([CLS] token)
# as the sentence representation
X_VEC_CLS = []
for vec in X_VEC:
    X_VEC_CLS.append(vec[0])

x_train = np.array(X_VEC_CLS[:8000])
x_test = np.array(X_VEC_CLS[8000:])
y_train = np.array(y[:8000])
y_test = np.array(y[8000:])

# The training and test sets are ready; now build the model
print("start training...")
from keras.models import Sequential
from keras.layers import Dense
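
# A minimal sketch of how the script might continue (the fragment above
# stops after the imports); the layer sizes and the binary-label assumption
# are illustrative, not the author's exact setup.
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(768,)))
model.add(Dense(1, activation='sigmoid'))  # assuming 0/1 labels
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=64, epochs=10,
          validation_data=(x_test, y_test))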
# -*- coding: utf-8 -*-
# author: Jclian91
# place: Pudong Shanghai
# time: 2020-02-12 12:45
import time

from bert.extract_feature import BertVector

bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=100)

# Time 100 rounds of encoding a batch of 1000 identical sentences
t1 = time.time()
for _ in range(100):
    print(_)
    text = ['英国苏格兰政府首席大臣、苏格兰民族党党魁妮古拉·斯特金11日在伦敦说,苏格兰人应有权重新选择是否独立。'] * 1000
    vec = bert_model.encode(text)["encodes"][0]
t2 = time.time()
print(t2 - t1)
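
# A small reporting sketch (not in the original): average latency per batch
# and per sentence, derived from the measured total.
n_rounds, batch_size = 100, 1000
print("avg per 1000-sentence batch: %.4f s" % ((t2 - t1) / n_rounds))
print("avg per sentence: %.6f s" % ((t2 - t1) / (n_rounds * batch_size)))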