def __init__(self,
             nb_classes=143,
             dense_dim=256,
             max_len=128,
             batch_size=32,
             epochs=30,
             train_corpus_path="data/sent.train",
             test_corpus_path="data/sent.test",
             weights_file_path="./model/bertweights_fc.h5"):
    """Store hyper-parameters / paths and build the BERT sentence encoder."""
    # Model hyper-parameters.
    self.nb_classes = nb_classes
    self.dense_dim = dense_dim
    self.max_len = max_len
    self.batch_size = batch_size
    self.epochs = epochs
    # Corpus and checkpoint locations.
    self.weights_file_path = weights_file_path
    self.train_corpus_path = train_corpus_path
    self.test_corpus_path = test_corpus_path
    # Number of training samples (hard-coded for this corpus).
    self.nb_samples = 46985
    # Sentence-level encoder: mean-pool the token vectors into one vector.
    self.bert_model = BertVector(
        pooling_strategy="REDUCE_MEAN",
        max_seq_len=self.max_len,
        bert_model_path=r"D:\赵鲸朋\pycharmModel0905\pycharmModel0905\keras_bert_classification\uncased_L-12_H-768_A-12",
        graph_tmpfile="./data/output/tmp_graph_xxx")
def __init__(self,
             nb_classes=2,
             gru_dim=128,
             dense_dim=128,
             max_len=100,
             batch_size=128,
             epochs=10,
             train_corpus_path="data/sent.train",
             test_corpus_path="data/sent.test",
             save_weights_file="./model/weights_lstm.h5"):
    """Store hyper-parameters / paths and build the token-level BERT encoder."""
    # Model hyper-parameters.
    self.nb_classes = nb_classes
    self.gru_dim = gru_dim
    self.dense_dim = dense_dim
    self.max_len = max_len
    self.batch_size = batch_size
    self.epochs = epochs
    # Corpus and checkpoint locations.
    self.train_corpus_path = train_corpus_path
    self.test_corpus_path = test_corpus_path
    self.save_weights_file = save_weights_file
    # Number of training samples (hard-coded for this corpus).
    self.nb_samples = 25000
    # fix: removed a stray "280" token and closed the unterminated
    # graph_tmpfile string literal — the original did not parse.
    self.bert_model = BertVector(pooling_strategy="NONE",
                                 max_seq_len=self.max_len,
                                 bert_model_path="./chinese_L-12_H-768_A-12/",
                                 graph_tmpfile="./tmp_graph_xxx")
def predict(inputTextList):
    """Predict person-person relations for a list of '#'-separated inputs.

    Each element of ``inputTextList`` has the form ``"per1#per2#doc"``.
    Returns a list of ``[per1, per2, relation, doc]``; ``relation`` is ""
    when prediction for that item fails.
    """
    print("begin")
    # Pick the checkpoint with the best score: model file names end with
    # the score (e.g. "...-0.8712.h5"), so sort by that trailing float.
    model_dir = './models'
    files = os.listdir(model_dir)
    models_path = [os.path.join(model_dir, name) for name in files]
    best_model_path = sorted(
        models_path,
        key=lambda x: float(x.split('-')[-1].replace('.h5', '')),
        reverse=True)[0]
    print("the best model is", best_model_path)
    model = load_model(best_model_path, custom_objects={"Attention": Attention})

    # Token-level BERT features for the whole (masked) sentence.
    bert_model = BertVector(pooling_strategy="NONE", max_seq_len=80)
    print("the bert model for sentence vector is ready")

    # fix: load the relation-id mapping ONCE instead of re-reading the
    # JSON file on every loop iteration.
    with open('data/rel_dict.json', 'r', encoding='utf-8') as f:
        rel_dict = json.load(f)
    id_rel_dict = {v: k for k, v in rel_dict.items()}

    return_List = []
    for inputText in inputTextList:
        # fix: pre-initialise so the except branch cannot raise NameError
        # when split('#') itself fails.
        per1 = per2 = doc = ""
        try:
            per1, per2, doc = inputText.split('#')
            # Mask both person names in the document so the model cannot
            # rely on the surface forms of the names.
            text = '$'.join([
                per1, per2,
                doc.replace(per1, len(per1) * '#').replace(per2, len(per2) * '#')
            ])
            vec = bert_model.encode([text])["encodes"][0]
            x_train = np.array([vec])
            # Model prediction: argmax over the class probabilities.
            predicted = model.predict(x_train)
            y = np.argmax(predicted[0])
            return_List.append([per1, per2, id_rel_dict[y], doc])
        except Exception as err:
            # fix: was a bare `except:`; keep the best-effort behaviour but
            # report what went wrong instead of a silent "error".
            print("error:", err)
            return_List.append([per1, per2, "", doc])
            continue
    return return_List
def __init__(self,
             nb_classes=2,
             dense_dim=256,
             max_len=100,
             batch_size=128,
             epochs=5,
             train_corpus_path="data/sent.train",
             test_corpus_path="data/sent.test",
             weights_file_path="./model/weights_fc.h5"):
    """Store hyper-parameters / paths and build the BERT sentence encoder."""
    # Model hyper-parameters.
    self.nb_classes = nb_classes
    self.dense_dim = dense_dim
    self.max_len = max_len
    self.batch_size = batch_size
    self.epochs = epochs
    # Corpus and checkpoint locations.
    self.weights_file_path = weights_file_path
    self.train_corpus_path = train_corpus_path
    self.test_corpus_path = test_corpus_path
    # Number of training samples (hard-coded for this corpus).
    self.nb_samples = 25000
    # Sentence-level encoder: mean-pool token vectors into one 768-d vector.
    self.bert_model = BertVector(pooling_strategy="REDUCE_MEAN",
                                 max_seq_len=self.max_len)
def __init__(self,
             nb_classes=3,
             dense_dim=256,
             max_len=100,
             batch_size=128,
             epochs=50,
             train_corpus_path="data/train.csv",
             test_corpus_path="data/dev.csv",
             weights_file_path="./model/weights_fc.h5"):
    """Store hyper-parameters / paths and build the BERT sentence encoder."""
    # Model hyper-parameters.
    self.nb_classes = nb_classes
    self.dense_dim = dense_dim
    self.max_len = max_len
    self.batch_size = batch_size
    self.epochs = epochs
    # Corpus and checkpoint locations.
    self.weights_file_path = weights_file_path
    self.train_corpus_path = train_corpus_path
    self.test_corpus_path = test_corpus_path
    # Number of training samples (hard-coded; original note pointed at a
    # local model dir D:\NLP项目\bert模型\chinese_L-12_H-768_A-12).
    self.nb_samples = 17
    # Sentence-level encoder: mean-pool token vectors into one vector.
    self.bert_model = BertVector(pooling_strategy="REDUCE_MEAN",
                                 max_seq_len=self.max_len,
                                 bert_model_path="chinese_L-12_H-768_A-12",
                                 graph_tmpfile="./tmp_graph_xxx")
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Dense
from keras.callbacks import EarlyStopping
from att import Attention
from keras.layers import GRU, LSTM, Bidirectional
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import numpy as np  # fix: np is used below but was never imported here

from load_data import get_train_test_pd
from bert.extract_feature import BertVector

# Read the corpus and convert it into train/test dataframes.
train_df, test_df = get_train_test_pd()

# Token-level BERT encoder (no pooling), up to 80 tokens per text.
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=80)
print('begin encoding')
f = lambda text: bert_model.encode([text])["encodes"][0]
train_df['x'] = train_df['text'].apply(f)
test_df['x'] = test_df['text'].apply(f)
print('end encoding')

# Stack per-row vectors into train/test arrays.
x_train = np.array([vec for vec in train_df['x']])
x_test = np.array([vec for vec in test_df['x']])
y_train = np.array([vec for vec in train_df['label']])
y_test = np.array([vec for vec in test_df['label']])
# print('x_train: ', x_train.shape)

# Convert integer labels into one-hot vectors (continues below this chunk).
import pandas as pd
import numpy as np
from bert.extract_feature import BertVector
from keras.models import load_model
from InputPassage_OutputSentence import outfile

# NOTE(review): this rebinds the imported `load_model` FUNCTION to the
# loaded model OBJECT — kept for backward compatibility with the code
# below, but the name should eventually be changed (e.g. `clf_model`).
load_model = load_model("model/question_sentence_classify_20000.h5")

# fix: read the sentences with a context manager so the file handle is
# closed deterministically (the original `open(...)` was never closed).
with open(outfile, encoding="UTF-8") as f:
    texts = [line.strip() for line in f]

labels = []
true_posibilities = []

# Sentence-level BERT encoder (mean pooling), up to 70 tokens per text.
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=70)

# Predict each sentence in turn.
for text in texts:
    # Encode the sentence as a single 768-d feature vector.
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])
    # Model prediction: class index with highest probability.
    predicted = load_model.predict(x_train)
    y = np.argmax(predicted[0])
    print("y:", y)
    label = '1' if y else '0'
    # Probability assigned to the positive class (index 1).
    true_posibility = predicted[0][1]
class BertClassification(object):
    """Sentence classifier: token-level BERT features -> GRU -> dense softmax."""

    def __init__(self,
                 nb_classes=2,
                 gru_dim=128,
                 dense_dim=128,
                 max_len=100,
                 batch_size=128,
                 epochs=10,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 save_weights_file="./model/weights_lstm.h5"):
        """Store hyper-parameters / paths and build the BERT encoder."""
        self.nb_classes = nb_classes
        self.gru_dim = gru_dim
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        self.save_weights_file = save_weights_file
        # Number of training samples (hard-coded for this corpus).
        self.nb_samples = 25000
        # fix: removed a stray "280" token and closed the unterminated
        # graph_tmpfile string literal — the original did not parse.
        self.bert_model = BertVector(pooling_strategy="NONE",
                                     max_seq_len=self.max_len,
                                     bert_model_path="./chinese_L-12_H-768_A-12/",
                                     graph_tmpfile="./tmp_graph_xxx")

    def text2bert(self, text):
        """Encode one text into its token-level BERT matrix."""
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """Convert tab-separated "label<TAB>content" lines into (X, one-hot y)."""
        X, y = [], []
        for line in lines:
            line = line.strip().split("\t")
            label = int(line[0])
            content = line[1]
            vec = self.text2bert(content)
            X.append(vec)
            y.append(label)
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Infinite batch generator over the (shuffled once) training corpus."""
        fr = codecs.open(self.train_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index: index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """Load and format the whole validation corpus."""
        fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        """Build Masking -> GRU -> Dense(relu) -> BatchNorm -> softmax."""
        # fix: the original chunk was split mid-statement ("return" / "model");
        # re-joined into a single valid return statement.
        x_in = Input(shape=(self.max_len, 768, ))
        x_out = Masking(mask_value=0.0)(x_in)
        x_out = GRU(self.gru_dim, dropout=0.25, recurrent_dropout=0.25)(x_out)
        x_out = Dense(self.dense_dim, activation="relu")(x_out)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        """Compile and fit the model, checkpointing on best validation accuracy."""
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=0.001),
                      metrics=['accuracy'])
        checkpoint = ModelCheckpoint(self.save_weights_file,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        x_test, y_test = self.data_val()
        steps_per_epoch = int(self.nb_samples / self.batch_size) + 1
        model.fit_generator(self.data_iter(),
                            steps_per_epoch=steps_per_epoch,
                            epochs=self.epochs,
                            verbose=1,
                            validation_data=(x_test, y_test),
                            validation_steps=None,
                            callbacks=[checkpoint])
import pickle
import os  # fix: os.environ is used below but `os` was never imported

# Train on specific GPUs (must be set before any CUDA initialisation).
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7,8"

import numpy as np
from load_data import train_df, test_df
from keras.utils import to_categorical
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, BatchNormalization, Dense
from bert.extract_feature import BertVector

# Collapse label '3' into class '0' (binary classification downstream).
train_df.loc[train_df['label'] == '3', ['label']] = '0'
test_df.loc[test_df['label'] == '3', ['label']] = '0'

# Sentence-level BERT encoder (mean pooling), up to 512 tokens per text.
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=512)
print('begin encoding')
f = lambda text: bert_model.encode([text])["encodes"][0]
train_df['x'] = train_df['text'].apply(f)
test_df['x'] = test_df['text'].apply(f)

# Stack per-row vectors into train/test arrays.
x_train = np.array([vec for vec in train_df['x']])
x_test = np.array([vec for vec in test_df['x']])
y_train = np.array([vec for vec in train_df['label']])
y_test = np.array([vec for vec in test_df['label']])
print('x_train: ', x_train.shape)

# Convert class vectors to binary class matrices.
num_classes = 2
y_train = to_categorical(y_train, num_classes)
# NOTE(review): this chunk looks like the TAIL of a test/inference helper —
# `test`, `models`, `bert_model` and `np` are defined outside this view
# (likely parameters of a `test_model(bert_model, models)` function);
# confirm the enclosing scope before refactoring.
# Build the text column from job title + job description.
test['text'] = test['岗位名称'] + test['岗位职责']
print('begin encoding')
f = lambda text: bert_model.encode([text])["encodes"][0]
test['x'] = test['text'].apply(f)
print('end encoding')
pred_data = np.array([_ for _ in test['x']])
# Collect one prediction column per model; model index 3 (the MLP) returns
# class probabilities, so it is argmax-ed into a class index first.
for ind, model in enumerate(models):
    if ind != 3:
        pred = model.predict(pred_data)
        test['pred%d' % (ind)] = pred
    else:
        pred = model.predict(pred_data)
        y = np.argmax(pred, axis=1)
        test['pred%d' % (ind)] = y
test = test.drop('x', axis=1)
test.to_csv('result/output.csv', encoding='gbk')
if __name__ == '__main__':
    bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=400)
    # Only run embedding() when the training data changes; the resulting
    # vectors are cached under data/, so re-running is unnecessary.
    #embedding(bert_model, 2)
    # The argument of embedding/load_split must match the trailing number
    # of the training-data file name.
    data = load_split(2)
    # Train four candidate classifiers and report their precision.
    model1 = skl_precision(svm.SVC, *data, kernel='rbf')
    model2 = skl_precision(es.RandomForestClassifier, *data, max_features='sqrt')
    model3 = skl_precision(nb.GaussianNB, *data)
    model4 = mlp_precision(*data)
    test_model(bert_model, [model1, model2, model3, model4])
class BertClassification(object):
    """143-way classifier on mean-pooled BERT sentence vectors.

    Training/validation lines are read from the module-level
    ``train_labcont`` / ``test_labcont`` lists; labels are category names
    mapped to ids via the module-level ``wosy2_to_id`` dict.
    """

    def __init__(self,
                 nb_classes=143,
                 dense_dim=256,
                 max_len=128,
                 batch_size=32,
                 epochs=30,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 weights_file_path="./model/bertweights_fc.h5"):
        """Store hyper-parameters / paths and build the BERT encoder."""
        self.nb_classes = nb_classes
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        # Number of training samples (hard-coded for this corpus).
        self.nb_samples = 46985
        # Sentence-level encoder: mean-pool token vectors into one vector.
        self.bert_model = BertVector(
            pooling_strategy="REDUCE_MEAN",
            max_seq_len=self.max_len,
            bert_model_path=r"D:\赵鲸朋\pycharmModel0905\pycharmModel0905\keras_bert_classification\uncased_L-12_H-768_A-12",
            graph_tmpfile="./data/output/tmp_graph_xxx")

    def text2bert(self, text):
        """Return the 768-d BERT vector for one text."""
        return self.bert_model.encode([text])["encodes"][0]

    def data_format(self, lines):
        """Turn tab-separated label/content lines into (X, one-hot y) arrays."""
        features, label_ids = [], []
        for raw in lines:
            parts = raw.strip().split("\t")
            # Column 0 is a category NAME; map it to an integer id.
            label_ids.append(wosy2_to_id[parts[0]])
            features.append(self.text2bert(parts[1]))
        X = np.array(features)
        y = np_utils.to_categorical(np.asarray(label_ids),
                                    num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Infinite batch generator over the in-memory training lines."""
        lines = train_labcont
        while True:
            for start in range(0, len(lines), self.batch_size):
                yield self.data_format(lines[start:start + self.batch_size])

    def data_val(self):
        """Format the in-memory validation lines."""
        return self.data_format(test_labcont)

    def create_model(self):
        """Build Dense(relu) -> BatchNorm -> Dense(softmax) on 768-d input."""
        x_in = Input(shape=(768, ))
        hidden = Dense(self.dense_dim, activation="relu")(x_in)
        hidden = BatchNormalization()(hidden)
        probs = Dense(self.nb_classes, activation="softmax")(hidden)
        return Model(inputs=x_in, outputs=probs)

    def train(self):
        """Compile and fit; checkpoint best val_acc and stop early."""
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        model.summary()
        checkpoint = ModelCheckpoint(self.weights_file_path,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        early_stopping = EarlyStopping(monitor='val_acc',
                                       patience=2,
                                       mode='max')
        x_test, y_test = self.data_val()
        model.fit_generator(
            self.data_iter(),
            steps_per_epoch=int(self.nb_samples / self.batch_size) + 1,
            epochs=self.epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            validation_steps=None,
            callbacks=[checkpoint, early_stopping])
#coding:utf-8
from bert.extract_feature import BertVector

# Mean-pool the token vectors into a single sentence vector.
pooling_strategy = "REDUCE_MEAN"
#pooling_strategy = "NONE"

bc = BertVector(pooling_strategy=pooling_strategy, max_seq_len=80)

# The same sentence, once pre-tokenised with spaces and once raw —
# useful for eyeballing whether whitespace changes the encoding.
s1 = '人 同 去 福田 图书馆 啊 在 家 写 作业 巨 没 feel , 我 的 作业'
s2 = "人同去福田图书馆啊在家写作业巨没feel,我的作业"

# Encode and print both variants.
v = bc.encode([s1])
v1 = v["encodes"][0]
print(v1)

v = bc.encode([s2])
v2 = v["encodes"][0]
print(v2)
class BertClassification(object):
    """3-class classifier on mean-pooled BERT sentence vectors (CSV corpus)."""

    def __init__(self,
                 nb_classes=3,
                 dense_dim=256,
                 max_len=100,
                 batch_size=128,
                 epochs=50,
                 train_corpus_path="data/train.csv",
                 test_corpus_path="data/dev.csv",
                 weights_file_path="./model/weights_fc.h5"):
        """Store hyper-parameters / paths and build the BERT encoder."""
        self.nb_classes = nb_classes
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        # Number of training samples (hard-coded for this tiny corpus).
        self.nb_samples = 17
        # Sentence-level encoder: mean-pool token vectors into one vector.
        self.bert_model = BertVector(pooling_strategy="REDUCE_MEAN",
                                     max_seq_len=self.max_len,
                                     bert_model_path="chinese_L-12_H-768_A-12",
                                     graph_tmpfile="./tmp_graph_xxx")

    def text2bert(self, text):
        """Return the 768-d BERT vector for one text."""
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """Parse CSV lines: column 2 is the text, column 4 the integer label.

        Malformed rows are skipped best-effort, logging the row's first field.
        NOTE(review): naive split(',') breaks on quoted commas — consider the
        csv module if the corpus can contain them.
        """
        X, y = [], []
        for line in lines:
            line = line.strip().split(",")
            try:
                label = int(line[4])
                content = line[2]
                vec = self.text2bert(content)
                X.append(vec)
                y.append(label)
            except (IndexError, ValueError):
                # fix: was a bare `except:` that also silently swallowed
                # BERT/encoding failures; now only malformed rows (missing
                # columns or a non-integer label) are skipped.
                print(line[0])
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Infinite batch generator over the (shuffled once) training CSV."""
        fr = codecs.open(self.train_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index:index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """Load and format the whole validation CSV."""
        fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        """Build Dense(relu) -> BatchNorm -> Dense(softmax) on 768-d input."""
        x_in = Input(shape=(768, ))
        x_out = Dense(self.dense_dim, activation="relu")(x_in)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        """Compile, fit (checkpointing best val_acc) and print macro P/R/F1."""
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        checkpoint = ModelCheckpoint(self.weights_file_path,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        x_test, y_test = self.data_val()
        model.fit_generator(
            self.data_iter(),
            steps_per_epoch=int(self.nb_samples / self.batch_size) + 1,
            epochs=self.epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            validation_steps=None,
            callbacks=[checkpoint])
        # Evaluate on the validation set and report macro-averaged metrics.
        pred = model.predict(x_test)
        pred = [np.argmax(val) for val in pred]
        print(pred)
        y_true = []
        for val in y_test:
            y_true.append(np.argmax(val))
        print(y_true)
        p = precision_score(y_true, pred, average='macro')
        r = recall_score(y_true, pred, average='macro')
        f1 = f1_score(y_true, pred, average='macro')
        print(p)
        print(r)
        print(f1)
from bert.extract_feature import BertVector
import numpy as np  # fix: np is used below but was never imported in this chunk

# Sample 10200 indices from the raw corpus (np.random.randint samples WITH
# replacement, so duplicates are possible).
# NOTE(review): the original comment claimed a 200-train / 10000-validation
# split, but the code below actually splits 8000 / 2200 — confirm which is
# intended.  `texts` and `labels` are defined outside this chunk.
idxs = np.random.randint(0, len(texts), size=10200)
X = []
y = []
for idx in idxs:  # fix: loop variable was `id`, shadowing the builtin
    X.append(texts[idx])
    y.append(labels[idx])

X_VEC = []
# Vectorise with BERT (token-level output, up to 100 tokens).
print("star encoding...")
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=100)
for text in X:
    X_VEC.append(bert_model.encode([text])["encodes"][0])

# Keep only the first token's vector per sentence — presumably the [CLS]
# embedding; confirm against BertVector's output layout.
X_VEC_CLS = []
for vec in X_VEC:
    X_VEC_CLS.append(vec[0])

# Split into train/test arrays.
x_train = np.array(X_VEC_CLS[:8000])
x_test = np.array(X_VEC_CLS[8000:])
y_train = np.array(y[:8000])
y_test = np.array(y[8000:])

# Train/test sets ready; model construction follows this chunk.
print("star training...")
"變種病毒": 100, "輝瑞疫苗": 100, "蠟筆小新": 100, "新型流感": 1, "冠狀病毒": 1, "武肺": 1, "指揮中心": 1, "口罩": 1, "酷碰券": 100, "蔡英文": 100, "蔡政府": 100, } dictionary = construct_dictionary(word_to_weight) # load model bv = BertVector() ws = WS('data') pos = POS('data') ner = NER('data') #sentiment_score score_s = sentiment("train_done.csv") # max_len max_head_len = 150 # load data df_train = cat_replace(pd.read_csv('train_done.csv')).drop_duplicates( subset=['claim']).dropna() df_test = cat_replace(pd.read_csv('test_done.csv')).drop_duplicates( subset=['claim']).dropna()