def create_model(self):
    """Fit a BLSTM classifier on ``self.bert_embedding`` for one epoch.

    Sets ``add_bos_eos`` to ``False`` on the embedding's processor, trains
    on ``valid_x`` / ``valid_y`` (names defined outside this block), prints
    the predictions for the first 20 samples and returns the fitted model.
    """
    self.bert_embedding.processor.add_bos_eos = False
    classifier = BLSTMModel(embedding=self.bert_embedding)
    classifier.fit(valid_x, valid_y, epochs=1)
    print(classifier.predict(valid_x[:20]))
    return classifier
def test_classification_eval_callback(self):
    """Train a default BLSTM model for one epoch with an ``EvalCallBack``."""
    full_train_x, full_train_y = SMP2018ECDTCorpus.load_data()
    test_x, test_y = SMP2018ECDTCorpus.load_data('test')

    # Only the first 1000 training samples are used.
    train_x = full_train_x[:1000]
    train_y = full_train_y[:1000]

    model = BLSTMModel()
    eval_callback = callbacks.EvalCallBack(model, test_x, test_y, step=1)
    model.fit(train_x,
              train_y,
              callbacks=[eval_callback],
              epochs=1)
def test_word2vec_embedding(self):
    """Fit a BLSTM model on word2vec embeddings and check prediction types.

    A single tokenised sentence must predict to a ``str``; a batch of one
    sentence must predict to a ``list``.
    """
    embedding = WordEmbeddings('sgns.weibo.bigram',
                               sequence_length=30,
                               limit=5000)
    self.prepare_model(embedding)
    self.model = BLSTMModel(embedding=embedding)
    self.model.fit(self.x_data,
                   self.y_data,
                   x_validate=self.x_eval,
                   y_validate=self.y_eval)

    sentence = list('语言学包含了几种分支领域。')
    logging.info(self.model.embedding.tokenize(sentence))

    # Predict once and reuse the value for both logging and the assertion.
    single_prediction = self.model.predict(sentence)
    logging.info(single_prediction)

    # assertIsInstance reports the offending value/type on failure.
    self.assertIsInstance(single_prediction, str)
    self.assertIsInstance(self.model.predict([sentence]), list)
def interview(self):
    """Score the saved classifier question by question on the test file.

    Predictions and gold labels are each concatenated into one string and
    compared in chunks of five characters; a chunk that matches exactly
    counts as one correctly answered question.  Nothing is reported when
    the two concatenated strings differ in length.

    NOTE(review): the chunking assumes every label is a single character
    and that each question contributes exactly five samples -- confirm
    against ``read_message``.
    """
    model = BLSTMModel.load_model("../健康管理师分字BERT-model")
    x_items, train_y = self.read_message('../data/健康管理师分类数据集/test.txt')
    model.evaluate(x_items, train_y)

    # str.join avoids the quadratic cost of repeated ``+=`` concatenation.
    results_string: str = ''.join(model.predict(item) for item in x_items)
    train_string: str = ''.join(train_y)

    if len(results_string) != len(train_string):
        # Misaligned strings cannot be compared chunk by chunk.
        return

    print('预测结果', results_string, '正确结果', train_string)
    print('五个五个去判断 全等就是做对了不全等就是做错了')

    # Whole questions only: an integer count keeps the wrong-answer tally
    # and the accuracy denominator exact.
    question_count: int = len(train_string) // 5
    right: int = 0
    for index in range(question_count):
        start, end = 5 * index, 5 * (index + 1)
        var = results_string[start:end]
        result = train_string[start:end]
        if var == result:
            print('做对了')
            right += 1
        else:
            print('做错了', var, result)

    wrong = question_count - right
    print('正确答案', right, '错误答案', wrong)
    # Guard against an empty test set instead of raising ZeroDivisionError.
    print('准确率', right / question_count if question_count else 0.0)
def interview(self):
    """Score the saved classifier question by question on the test file.

    Predictions and gold labels are each concatenated into one string and
    compared in chunks of five characters; a chunk that matches exactly
    counts as one correctly answered question.  The first sample of each
    chunk is printed next to the verdict.  Nothing is reported when the
    two concatenated strings differ in length.

    NOTE(review): the chunking assumes every label is a single character
    and that each question contributes exactly five samples -- confirm
    against ``read_message``.
    """
    model = BLSTMModel.load_model("../西药执业药师-model")
    x_items, train_y = self.read_message('../data/西药执业药师/test.txt')
    model.evaluate(x_items, train_y)

    # str.join avoids the quadratic cost of repeated ``+=`` concatenation.
    results_string: str = ''.join(model.predict(item) for item in x_items)
    train_string: str = ''.join(train_y)

    if len(results_string) != len(train_string):
        # Misaligned strings cannot be compared chunk by chunk.
        return

    print('五个五个去判断 全等就是做对了不全等就是做错了')
    b: int = len(train_string) // 5
    print('验证数据集长度', b)

    right: int = 0
    for i in range(b):
        start, end = 5 * i, 5 * (i + 1)
        # Index with the loop variable: the previous ``b * 5`` offset
        # pointed past the end of the data, so the slice was always empty.
        single = x_items[start:start + 1]
        var = results_string[start:end]
        result = train_string[start:end]
        if var == result:
            print('做对了', single)
            right += 1
        else:
            print('做错了', var, result, single)

    acc = b - right
    print('正确答案', right, '错误答案', acc)
    # Guard against an empty test set instead of raising ZeroDivisionError.
    print('准确率', right / b if b else 0.0)
def train(self):
    """Train on ``car/train.csv``, write test predictions, save the model.

    The predictions for ``car/test.csv`` are written to
    ``data/test_predict_bert_car.csv`` as ``id,flag`` rows.  ``bert`` and
    ``tf_board_callback`` are defined outside this block.
    """
    x_items, train_y, valid_x, valid_y = self.read_message('car/train.csv')

    # Build the classifier on top of the BERT character embedding.
    model = BLSTMModel(bert)
    # Fit with training data, labels and the validation split.
    model.fit(x_items, train_y, valid_x, valid_y,
              batch_size=64,
              epochs=12,
              callbacks=[tf_board_callback])

    rows = pd.read_csv("car/test.csv", encoding='utf-8').values.tolist()
    # Each sample is the character list of columns 1 and 2 concatenated;
    # column 0 is the row id.
    test_data = [list(str(row[1]) + str(row[2])) for row in rows]
    id_list = [row[0] for row in rows]

    predict_answers = model.predict(x_data=test_data)

    # The context manager guarantees the file is flushed and closed.
    with open("data/test_predict_bert_car.csv", 'w', encoding='utf-8') as out_file:
        out_file.write("id,flag\n")
        for sample_id, answer in zip(id_list, predict_answers):
            sample_id = sample_id.strip()
            out_file.write(str(sample_id) + "," + str(answer) + "\n")

    # Persist the trained model.
    model.save("../model/news-classification-bert-model")
def test_save_and_load(self):
    """Round-trip the fitted model through ``save`` / ``load_model``.

    The model is saved into a private temporary directory (removed when
    the test finishes) rather than directly into the shared system temp
    directory, so runs cannot collide with or pollute other files there.
    """
    self.test_fit()
    with tempfile.TemporaryDirectory() as model_path:
        self.model.save(model_path)
        new_model = BLSTMModel.load_model(model_path)
        self.assertIsNotNone(new_model)

        sentence = list('语言学包含了几种分支领域。')
        result = new_model.predict(sentence)
        # assertIsInstance reports the offending value/type on failure.
        self.assertIsInstance(result, str)
def test_save_and_load(self):
    """Save the fitted model under a timestamped temp path and reload it."""
    self.test_fit()

    # <system temp dir>/kashgari_model/<current timestamp>
    temp_root = tempfile.gettempdir()
    run_stamp = str(time.time())
    model_path = os.path.join(temp_root, 'kashgari_model', run_stamp)

    self.model.save(model_path)
    restored = BLSTMModel.load_model(model_path)
    assert restored is not None

    chars = list('语言学包含了几种分支领域。')
    prediction = restored.predict(chars)
    assert isinstance(prediction, str)
def interview(self):
    """Score the saved classifier per question and export the outcomes.

    Predictions and gold labels are each concatenated into one string and
    compared in chunks of five characters; a chunk that matches exactly
    counts as one correctly answered question.  The matching five raw
    lines from ``full_message`` are printed and then written, split on
    tabs, to ``right single.csv`` or ``wrong single.csv``.  Nothing is
    reported or written when the two concatenated strings differ in
    length.

    NOTE(review): the chunking assumes every label is a single character
    and that each question contributes exactly five samples -- confirm
    against ``read_message`` / ``full_message``.
    """
    model = BLSTMModel.load_model("../健康管理师单选分字BERT-model")
    x_items, train_y = self.read_message('../data/yingyangshi/test.txt')
    x_full = self.full_message('../data/yingyangshi/test.txt')
    model.evaluate(x_items, train_y)

    # str.join avoids the quadratic cost of repeated ``+=`` concatenation.
    results_string: str = ''.join(model.predict(item) for item in x_items)
    train_string: str = ''.join(train_y)
    right_predict: list = []
    wrong_predict: list = []

    if len(results_string) != len(train_string):
        # Misaligned strings cannot be compared chunk by chunk.
        return

    print('预测结果', results_string, '正确结果', train_string)
    print('五个五个去判断 全等就是做对了不全等就是做错了')
    b: int = len(train_string) // 5
    print('验证数据集长度', b)

    right: int = 0
    for i in range(b):
        start_single: int = i * 5
        end_single: int = (i + 1) * 5
        single = x_full[start_single:end_single]
        var = results_string[start_single:end_single]
        result = train_string[start_single:end_single]
        if var == result:
            print('做对了')
            right_predict.append(single)
            right += 1
        else:
            print('做错了', var, result)
            wrong_predict.append(single)
        for varey in single:
            print(varey)

    acc = b - right

    # Each stored group is a list of raw tab-separated lines; one CSV row
    # is written per line.  Distinct loop names avoid rebinding the
    # variable that is being iterated.
    with open('wrong single.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        for wrong_group in wrong_predict:
            for message in wrong_group:
                csv_writer.writerow(message.split('\t'))
    with open('right single.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        for right_group in right_predict:
            for message in right_group:
                csv_writer.writerow(message.split('\t'))

    print('正确答案', right, '错误答案', acc)
    # Guard against an empty test set instead of raising ZeroDivisionError.
    print('准确率', right / b if b else 0.0)
def test_dataset(model_dir: str) -> list:
    """Classify unlabelled pages from the database and refresh the hot keys.

    Rows of ``webpage_text`` whose ``<model>_predict`` column is NULL are
    fetched, tokenised (text and title joined), classified with the model
    stored at ``model_dir`` and written back.  The 100 most common tokens
    over all processed rows (single-character tokens skipped) then replace
    the contents of ``hot_key``.

    Args:
        model_dir: Path of the saved model; it must contain ``cnn.model``,
            ``cnnlstm.model`` or ``blstm.model``.  The part of the last
            path component before ``.model`` names the prediction column.

    Raises:
        ValueError: If ``model_dir`` matches none of the supported models
            (previously this surfaced as a ``NameError`` mid-run).

    NOTE(review): the signature is annotated ``-> list`` but the function
    has never returned a value; the annotation is kept for compatibility.
    """
    model_name = model_dir.split('.model')[0].split('/')[-1]
    # Identifiers cannot be bound as query parameters, so the column name
    # is back-tick quoted with embedded back-ticks doubled.
    predict_column = '`{}_predict`'.format(model_name.replace('`', '``'))

    # Select the model class that matches the saved model.
    if 'cnn.model' in model_dir:
        model_class = CNNModel
    elif 'cnnlstm.model' in model_dir:
        model_class = CNNLSTMModel
    elif 'blstm.model' in model_dir:
        model_class = BLSTMModel
    else:
        raise ValueError('unsupported model directory: {!r}'.format(model_dir))

    conn = pymysql.connect(host=DB_HOST,
                           port=int(DB_PORT),
                           user=DB_USER,
                           password=DB_PASS,
                           db=DB_NAME,
                           charset=DB_CHARSET)
    try:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT `page_text`,`page_title`,`category`,`hash` '
            'FROM `webpage_text` '
            'WHERE {} IS NULL '
            'ORDER BY `time` desc'.format(predict_column)
        )
        data = cursor.fetchall()

        model = model_class.load_model(model_dir)

        all_text = []
        update_sql = (
            'UPDATE `webpage_text` SET {}=%s WHERE hash=%s'.format(predict_column)
        )
        for row in tqdm.tqdm(data):
            # Tokenise the page text joined with its title.
            content = strip_stopwords(list(jieba.cut(row[0] + '。' + row[1])))
            all_text += content
            predict = model.predict(content)
            # Values are bound as parameters so quotes in the data cannot
            # break or inject into the statement.
            cursor.execute(update_sql, (predict, row[3]))
            conn.commit()

        # Count token frequencies and store the top entries as hot keys.
        cursor.execute('DELETE FROM `hot_key` WHERE 1=1')
        conn.commit()
        rank = 1
        for word, count in Counter(all_text).most_common(100):
            if len(word) == 1:
                continue
            cursor.execute(
                'INSERT INTO `hot_key` VALUES (%s, %s, %s)',
                (rank, word, count)
            )
            conn.commit()
            rank += 1
    finally:
        # Always release the connection, even when a query fails.
        conn.close()
    print('[+] Success')
def test_multi_label_model(self):
    """Fit a multi-label model, then check predictions before and after reload.

    Both the freshly trained model and the reloaded one must return a
    ``tuple`` for a single sample.
    """
    classifier = self.model_class(multi_label=True)
    classifier.fit(train_x,
                   train_multi_y,
                   eval_x,
                   eval_multi_y,
                   epochs=2)
    first_prediction = classifier.predict(train_x[0])
    assert isinstance(first_prediction, tuple)

    # <system temp dir>/kashgari_model/<current timestamp>
    temp_root = tempfile.gettempdir()
    model_path = os.path.join(temp_root, 'kashgari_model', str(time.time()))
    classifier.save(model_path)

    restored = BLSTMModel.load_model(model_path)
    assert restored is not None

    chars = list('语言学包含了几种分支领域。')
    assert isinstance(restored.predict(chars), tuple)
def test_bert_model(self):
    """Check that a BERT-backed BLSTM model predicts the same after reload."""
    bert_embedding = BERTEmbedding(bert_path,
                                   task=kashgari.CLASSIFICATION,
                                   sequence_length=100)
    trained = BLSTMModel(embedding=bert_embedding)
    trained.fit(valid_x, valid_y, epochs=1)
    expected = trained.predict(valid_x[:20])
    # Placeholder assertion carried over unchanged; it can never fail.
    assert True

    save_dir = os.path.join(tempfile.gettempdir(), str(time.time()))
    trained.save(save_dir)

    reloaded = kashgari.utils.load_model(save_dir)
    actual = reloaded.predict(valid_x[:20])
    assert np.array_equal(actual, expected)
def interview(self):
    """Score the saved multi-choice classifier question by question.

    Predictions and gold labels are each concatenated into one string and
    compared in chunks of five characters; a chunk that matches exactly
    counts as one correctly answered question.  Nothing is reported or
    written when the two concatenated strings differ in length.

    NOTE(review): ``right_predict`` / ``wrong_predict`` are never filled
    in this variant, so both CSV files are created empty; the chunking
    also assumes single-character labels and five samples per question.
    Confirm the intended behaviour against ``read_message``.
    """
    model = BLSTMModel.load_model(
        "../model/health_manager_multi_bert-model")
    x_items, train_y = read_message(
        '../data/health_manager_v2/multiple-choice.csv')
    model.evaluate(x_items, train_y)

    # str.join avoids the quadratic cost of repeated ``+=`` concatenation.
    results_string: str = ''.join(model.predict(item) for item in x_items)
    train_string: str = ''.join(train_y)
    right_predict: list = []
    wrong_predict: list = []

    if len(results_string) != len(train_string):
        # Misaligned strings cannot be compared chunk by chunk.
        return

    print('预测结果', results_string, '正确结果', train_string)
    print('五个五个去判断 全等就是做对了不全等就是做错了')
    b: int = len(train_string) // 5
    print('验证数据集长度', b)

    right: int = 0
    for i in range(b):
        start_single: int = i * 5
        end_single: int = (i + 1) * 5
        var = results_string[start_single:end_single]
        result = train_string[start_single:end_single]
        if var == result:
            print('做对了')
            # Count the hit; without this the accuracy was always 0.
            right += 1

    acc = b - right

    # Distinct loop names avoid rebinding the variable being iterated.
    with open('wrong single.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        for wrong_group in wrong_predict:
            for message in wrong_group:
                csv_writer.writerow(message.split('\t'))
    with open('right single.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        for right_group in right_predict:
            for message in right_group:
                csv_writer.writerow(message.split('\t'))

    print('正确答案', right, '错误答案', acc)
    # Guard against an empty test set instead of raising ZeroDivisionError.
    print('准确率', right / b if b else 0.0)
def train(self):
    """Train a BERT-backed BLSTM classifier on the yingyangshi data and save it."""
    train_samples, train_labels = self.read_message('../data/yingyangshi/train.txt')
    dev_samples, dev_labels = self.read_message('../data/yingyangshi/dev.txt')

    # Load the BERT character embedding.
    bert_embedding = BERTEmbedding('textclassfation/input0/chinese_L-12_H-768_A-12')
    classifier = BLSTMModel(bert_embedding)
    classifier.fit(train_samples,
                   train_labels,
                   dev_samples,
                   dev_labels,
                   epochs=2,
                   batch_size=64)

    # Persist the trained model.
    classifier.save("../健康管理师单选分字BERT-model")
def train(self):
    """Train a BLSTM classifier with its default embedding and save it.

    ``tf_board_callback`` is defined outside this block.
    """
    data_dir = '../data/Chinese medicine licensed pharmacist'
    train_samples, train_labels = self.read_message(data_dir + '/train.txt')
    dev_samples, dev_labels = self.read_message(data_dir + '/dev.txt')

    # No embedding is passed, so the model's default embedding is used.
    classifier = BLSTMModel()

    # Fit with training data, labels and the dev split.
    fit_options = {'callbacks': [tf_board_callback]}
    classifier.fit(train_samples,
                   train_labels,
                   dev_samples,
                   dev_labels,
                   batch_size=32,
                   epochs=20,
                   fit_kwargs=fit_options)

    # Persist the trained model.
    classifier.save("../model/中医执业药师char-model")
def train(self):
    """Train a BERT-backed BLSTM classifier on the 西药执业药师 data and save it.

    ``tf_board_callback`` is defined outside this block.
    """
    train_samples, train_labels = self.read_message('../data/西药执业药师/train.txt')
    dev_samples, dev_labels = self.read_message('../data/西药执业药师/dev.txt')

    # Load the BERT character embedding.
    bert_embedding = BERTEmbedding('bert-base-chinese', sequence_length=200)
    classifier = BLSTMModel(bert_embedding)

    # Fit with training data, labels and the dev split.
    fit_options = {'callbacks': [tf_board_callback]}
    classifier.fit(train_samples,
                   train_labels,
                   dev_samples,
                   dev_labels,
                   epochs=8,
                   batch_size=256,
                   fit_kwargs=fit_options)

    # Persist the trained model.
    classifier.save("../西药执业药师-model")
def train(self):
    """Train a BERT-backed BLSTM classifier on three merged corpora and save it.

    The 健康管理师 training set is extended in place with the 西药执业药师
    and yingyangshi training sets; validation uses the 健康管理师 split.
    ``tf_board_callback`` is defined outside this block.
    """
    x_items, train_y = self.read_message('../data/健康管理师分类数据集/train.txt')
    extra_corpora = [
        self.read_message('../data/西药执业药师/train.txt'),
        self.read_message(
            '../data/yingyangshi/train.txt'),
    ]
    # Append the additional corpora to the base training set.
    for extra_x, extra_y in extra_corpora:
        x_items.extend(extra_x)
        train_y.extend(extra_y)

    x_dev, dev_y = self.read_message('../data/健康管理师分类数据集/valid.txt')

    # Load the BERT character embedding.
    bert_embedding = BERTEmbedding('bert-base-chinese', sequence_length=200)
    classifier = BLSTMModel(bert_embedding)

    # Fit with training data, labels and the validation split.
    fit_options = {'callbacks': [tf_board_callback]}
    classifier.fit(x_items,
                   train_y,
                   x_dev,
                   dev_y,
                   epochs=8,
                   batch_size=128,
                   fit_kwargs=fit_options)

    # Persist the trained model.
    classifier.save("../健康管理师分字BERT-model")
def pre_train(self):
    """Load the saved classification model and evaluate it on the test file."""
    test_path = '../data/Chinese medicine licensed pharmacist/test.txt'
    classifier = BLSTMModel.load_model("../model/中医执业药师classification-model")
    samples, labels = self.read_message(test_path)
    classifier.evaluate(samples, labels)
def pre_evaluate(self):
    """Load the saved multi-choice model and run one prediction on an empty string."""
    classifier = BLSTMModel.load_model("../model/health_manager_multi_bert-model")
    # The prediction result is not used.
    classifier.predict("")
def setUpClass(cls):
    """Attach the epoch count and a default BLSTM model to the test class."""
    setattr(cls, 'epochs', 3)
    setattr(cls, 'model', BLSTMModel())
def setUpClass(cls):
    """Attach the epoch count and a word2vec-backed BLSTM model to the test class."""
    cls.epochs = 3
    # The embedding comes from the shared EmbeddingManager.
    cls.model = BLSTMModel(EmbeddingManager.get_w2v())
class BLSTMModelModelTest(unittest.TestCase):
    """End-to-end tests for ``BLSTMModel`` on a tiny in-memory corpus.

    Several tests call ``test_fit`` first to obtain a trained
    ``self.model`` before exercising conversion, prediction, evaluation
    and persistence.
    """

    def __init__(self, *args, **kwargs):
        """Store the model class under test and the fixed toy dataset."""
        super().__init__(*args, **kwargs)
        self.__model_class__ = BLSTMModel

        # Training samples are lists of characters; labels are strings.
        self.x_data = [
            list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
            list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
            list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
            list('语言学包含了几种分支领域。'),
            list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分'),
        ]
        self.y_data = ['a', 'a', 'a', 'b', 'c']

        self.x_eval = [
            list('语言学是一门关于人类语言的科学研究。'),
            list('语言学包含了几种分支领域。'),
            list('在语言结构研究与意义研究之间存在一个重要的主题划分。'),
            list('语法中包含了词法,句法以及语音。'),
            list('语音学是语言学的一个相关分支,它涉及到语音与非语音声音的实际属性,以及它们是如何发出与被接收到的。'),
            list('与学习语言不同,语言学是研究所有人类语文发展有关的一门学术科目。'),
        ]
        self.y_eval = ['a', 'a', 'a', 'b', 'c', 'a']

    def prepare_model(self, embedding: BaseEmbedding = None):
        """Create ``self.model`` from the class under test and ``embedding``."""
        self.model = self.__model_class__(embedding)

    def test_build(self):
        """Fitting must populate the label and token vocabularies."""
        self.prepare_model()
        self.model.fit(self.x_data, self.y_data)
        self.assertEqual(len(self.model.label2idx), 4)
        self.assertGreater(len(self.model.token2idx), 4)
        logging.info(self.model.embedding.token2idx)

    def test_fit(self):
        """Fit with a validation split; also used as setup by other tests."""
        self.prepare_model()
        self.model.fit(self.x_data,
                       self.y_data,
                       x_validate=self.x_eval,
                       y_validate=self.y_eval)

    def test_label_token_convert(self):
        """Label/index conversion types and tokenised sentence length."""
        self.test_fit()
        self.assertIsInstance(self.model.convert_label_to_idx('a'), int)
        self.assertIsInstance(self.model.convert_idx_to_label(1), str)
        self.assertTrue(
            all(isinstance(i, int)
                for i in self.model.convert_label_to_idx(['a'])))
        self.assertTrue(
            all(isinstance(i, str)
                for i in self.model.convert_idx_to_label([1, 2])))

        sentence = list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分')
        tokens = self.model.embedding.tokenize(sentence)
        # Tokenising is expected to yield two tokens more than the input.
        self.assertEqual(len(sentence) + 2, len(tokens))

    def test_predict(self):
        """A single sentence predicts to ``str``; a batch predicts to ``list``."""
        self.test_fit()
        sentence = list('语言学包含了几种分支领域。')
        single_prediction = self.model.predict(sentence)
        self.assertIsInstance(single_prediction, str)
        self.assertIsInstance(self.model.predict([sentence]), list)
        logging.info('test predict: {} -> {}'.format(sentence,
                                                     single_prediction))

    def test_eval(self):
        """``evaluate`` must run on the training data without raising."""
        self.test_fit()
        self.model.evaluate(self.x_data, self.y_data)

    def test_bert(self):
        """Fit and predict with a BERT embedding."""
        embedding = BERTEmbedding('chinese_L-12_H-768_A-12',
                                  sequence_length=30)
        self.prepare_model(embedding)
        self.model.fit(self.x_data,
                       self.y_data,
                       x_validate=self.x_eval,
                       y_validate=self.y_eval)
        sentence = list('语言学包含了几种分支领域。')
        logging.info(self.model.embedding.tokenize(sentence))
        single_prediction = self.model.predict(sentence)
        logging.info(single_prediction)
        self.assertIsInstance(single_prediction, str)
        self.assertIsInstance(self.model.predict([sentence]), list)

    def test_word2vec_embedding(self):
        """Fit and predict with a word2vec embedding."""
        embedding = WordEmbeddings('sgns.weibo.bigram',
                                   sequence_length=30,
                                   limit=5000)
        # prepare_model already builds a BLSTMModel on this embedding, so
        # the model is not constructed a second time.
        self.prepare_model(embedding)
        self.model.fit(self.x_data,
                       self.y_data,
                       x_validate=self.x_eval,
                       y_validate=self.y_eval)
        sentence = list('语言学包含了几种分支领域。')
        logging.info(self.model.embedding.tokenize(sentence))
        single_prediction = self.model.predict(sentence)
        logging.info(single_prediction)
        self.assertIsInstance(single_prediction, str)
        self.assertIsInstance(self.model.predict([sentence]), list)

    def test_save_and_load(self):
        """Round-trip the fitted model through ``save`` / ``load_model``.

        A private temporary directory is used (and removed afterwards)
        instead of writing model files straight into the shared system
        temp directory.
        """
        self.test_fit()
        with tempfile.TemporaryDirectory() as model_path:
            self.model.save(model_path)
            new_model = BLSTMModel.load_model(model_path)
            self.assertIsNotNone(new_model)
            sentence = list('语言学包含了几种分支领域。')
            result = new_model.predict(sentence)
            self.assertIsInstance(result, str)
# Train, evaluate and save a BERT-backed BLSTM classifier.
# The data splits (train_x, x_train, ...) and model_path are defined
# outside this block.
log_filepath = r"D:\data\biendata\ccks2019_el\kashgari\m0log"

# Stop when the validation loss has not improved for two epochs.
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=2)
log = TensorBoard(log_dir=log_filepath,
                  write_images=False,
                  write_graph=True,
                  histogram_freq=0)

emn_path = r'D:\data\bert\chinese-bert_chinese_wwm_L-12_H-768_A-12'
embedding = BERTEmbedding(emn_path, sequence_length=512)

model = BLSTMModel(embedding)
model.build_model(x_train, y_train, x_validate, y_validate)
# The stray ``model.model.fit_generator()`` call was removed: it passed no
# generator and raised TypeError before training could start.
model.fit(train_x,
          train_y,
          x_validate=validate_x,
          y_validate=validate_y,
          epochs=20,
          batch_size=128,
          labels_weight=True,
          fit_kwargs={'callbacks': [early_stop, log]})
model.evaluate(test_x, test_y)
model.save(model_path)
def pre_evaluate(self):
    """Load the saved model and evaluate it on the health_manager_v4 test file."""
    classifier = BLSTMModel.load_model("../健康管理师分字-model")
    samples, labels = self.read_message('../data/health_manager_v4/test.txt')
    classifier.evaluate(samples, labels)
def predict_each_line(args, model):
    """Predict every test sample and record the misclassified ones.

    Each sample whose prediction differs from its label is printed and
    written to ``args.output_file`` as ``text<TAB>label<TAB>prediction``.

    Args:
        args: Parsed options; ``test_set_path`` and ``output_file`` are read.
        model: Trained classifier exposing ``predict``.
    """
    test_x, test_y = fetch_data_set(args.test_set_path)
    # Explicit UTF-8 keeps the output independent of the platform default
    # encoding; the context manager closes the file even on errors.
    with open(args.output_file, 'w', encoding='utf-8') as fout:
        for line, y in zip(test_x, test_y):
            text = ''.join(line)
            expected = ''.join(y)
            result = model.predict(text_processor(text),
                                   batch_size=1,
                                   debug_info=False)
            if result != expected:
                str_message = text + "\t" + expected + "\t" + result
                print(str_message)
                fout.write(str_message + '\n')


if __name__ == '__main__':
    # Initialise parameters and logging.
    args = params_setup()
    logging.basicConfig(filename=args.log_path, level=logging.DEBUG)
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    # load_model returns a fresh model, so it is called on the class;
    # building a BERT embedding and an untrained model first only to
    # discard them was wasted work.
    model = BLSTMModel.load_model(args.model_path)

    if args.predict_mode == "from_input":
        predict_from_user_input(model)
    else:
        predict_from_test_set(args, model)
# Train, evaluate and save a word2vec-backed BLSTM classifier.
# The data splits (train_x, validate_x, test_x, ...) and model_path are
# defined outside this block.
log_filepath = r"D:\data\biendata\ccks2019_el\clf_log"

# Stop when the validation loss has not improved for two epochs.
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=2)
log = TensorBoard(log_dir=log_filepath,
                  write_images=False,
                  write_graph=True,
                  histogram_freq=0)

# Only the word2vec embedding is built.  The BERT embedding that used to
# be constructed first was overwritten before any use, so loading it was
# pure overhead.
emn_path = r'D:/data/word2vec/zh/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5.utf8.txt'
embedding = WordEmbeddings(emn_path, sequence_length=1024)

model = BLSTMModel(embedding)
# Train on the first 100000 samples, validate on the first 20000.
model.fit(train_x[:100000],
          train_y[:100000],
          x_validate=validate_x[:20000],
          y_validate=validate_y[:20000],
          epochs=20,
          batch_size=256,
          labels_weight=True,
          fit_kwargs={'callbacks': [early_stop, log]})
model.evaluate(test_x, test_y)
model.save(model_path)
def pre_train(self):
    """Load the saved model and print its prediction for every dev sample."""
    classifier = BLSTMModel.load_model('../classification-model')
    # Only the samples are needed; the labels are discarded.
    samples = self.read_message('../data/西药执业药师/dev.txt')[0]
    for sample in samples:
        prediction = classifier.predict(sample)
        print("\n" + prediction)