def train(self):
    """Train a CNN classifier on the car corpus, write test-set predictions
    to CSV, and save the trained model.

    Reads train/validation splits via self.read_message, fits for 12 epochs,
    then predicts over car/test.csv (id, title, body columns) and writes one
    "id,prediction" line per row.
    """
    x_items, train_y, valid_x, valid_y = self.read_message('car/train.csv')
    model = CNNModel()
    # Fit on training data with a held-out validation split; TensorBoard
    # callback logs progress.
    model.fit(x_items, train_y, valid_x, valid_y, batch_size=64, epochs=12,
              callbacks=[tf_board_callback])
    # Build the test set: column 0 is the id, columns 1-2 are title/body.
    rows = pd.read_csv("car/test.csv", encoding='utf-8').values.tolist()
    test_data = []
    id_list = []
    for row in rows:
        test_data.append(jieba.lcut(str(row[1]) + str(row[2])))
        id_list.append(row[0])
    predict_answers = model.predict(x_data=test_data)
    # BUG FIX: the output file was opened with a bare open() and never
    # closed; a context manager guarantees the buffer is flushed even if a
    # write fails.
    with open("data/test_predict_bert_car.csv", 'w', encoding='utf-8') as out:
        for row_id, answer in zip(id_list, predict_answers):
            # str() first: ids read from CSV may be parsed as ints, which
            # have no .strip() (the original called .strip() directly).
            out.write(str(row_id).strip() + "," + str(answer) + "\n")
    model.save("../model/news-classification-bert-model")
def test_dataset(model_dir: str) -> None:
    """Classify not-yet-predicted pages from the database with the model at
    *model_dir*, write predictions back, and refresh the top-100 hot words.

    The prediction column is named ``<model>_predict`` where ``<model>`` is
    derived from the model file name (e.g. ``model/cnn.model`` -> ``cnn``).

    Raises:
        ValueError: if *model_dir* does not name a known model type.
    """
    # Column-name prefix derived once from the model path.
    model_name = model_dir.split('.model')[0].split('/')[-1]
    conn = pymysql.connect(host=DB_HOST,
                           port=int(DB_PORT),
                           user=DB_USER,
                           password=DB_PASS,
                           db=DB_NAME,
                           charset=DB_CHARSET)
    cursor = conn.cursor()
    try:
        # Identifiers cannot be bound as parameters, but model_name comes
        # from our own file path, not user input.
        cursor.execute(
            "SELECT `page_text`,`page_title`,`category`,`hash` "
            "FROM `webpage_text` WHERE `%s_predict` IS NULL "
            "ORDER BY `time` DESC" % model_name
        )
        data = cursor.fetchall()
        # Pick the model class matching the file name.
        if 'cnn.model' in model_dir:
            model = CNNModel.load_model(model_dir)
        elif 'cnnlstm.model' in model_dir:
            model = CNNLSTMModel.load_model(model_dir)
        elif 'blstm.model' in model_dir:
            model = BLSTMModel.load_model(model_dir)
        else:
            # BUG FIX: previously `model` stayed unbound here and the loop
            # below crashed with a confusing NameError.
            raise ValueError('unsupported model file: %s' % model_dir)
        all_text = []
        for row in tqdm.tqdm(data):
            # Tokenize title + body (joined with a full stop), drop stopwords.
            content = strip_stopwords(list(jieba.cut(row[0] + '。' + row[1])))
            all_text += content
            predict = model.predict(content)
            # BUG FIX: the original concatenated '...{predict}"' + 'WHERE ...'
            # with no space (invalid SQL) and interpolated values directly
            # into the statement (SQL injection); use bound parameters.
            cursor.execute(
                'UPDATE `webpage_text` SET `{0}_predict`=%s '
                'WHERE `hash`=%s'.format(model_name),
                (predict, row[3])
            )
            conn.commit()
        # Recompute the hot-word ranking: wipe the table, then insert the
        # 100 most frequent multi-character tokens.
        counts = Counter(all_text)
        cursor.execute('DELETE FROM `hot_key` WHERE 1=1')
        conn.commit()
        rank = 1
        for word, freq in counts.most_common(100):
            if len(word) == 1:
                continue  # single characters are not meaningful hot words
            cursor.execute(
                'INSERT INTO `hot_key` VALUES (%s, %s, %s)',
                (rank, word, freq)
            )
            conn.commit()
            rank += 1
        print('[+] Success')
    finally:
        # BUG FIX: cursor/connection were never closed.
        cursor.close()
        conn.close()
def train(self):
    """Fit a BERT-backed CNN classifier on the full corpus and persist it.

    Uses the BERT checkpoint at self.bert_place for character embeddings,
    trains for 200 epochs, saves the model, then evaluates on the same data
    (no held-out split here).
    """
    samples, labels = read_message()
    # Character-level BERT embedding, sequences capped at 256 tokens.
    embedding = BERTEmbedding(self.bert_place, sequence_length=256)
    classifier = CNNModel(embedding)
    classifier.fit(samples, labels, epochs=200, batch_size=32,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("output/classification-model")
    classifier.evaluate(samples, labels)
def train():
    """Train the CNN classifier with class weighting, save it, and echo a
    prediction for every training sample."""
    samples, labels = read_message()
    # `bert` is expected to be a module-level embedding defined outside this
    # function — confirm it is initialized before train() is called.
    classifier = CNNModel(bert)
    classifier.fit(samples, labels, epochs=20, class_weight=True,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("../classification-model")
    # Sanity check: print the model's prediction for each training sample.
    for sample in samples:
        print("\n" + classifier.predict(sample))
def train(self):
    """Train a BERT+CNN classifier on the pharmacist-exam corpus and return
    the evaluation result on the held-out test split.
    """
    # NOTE(review): the dev split is read from test.txt and the test split
    # from dev.txt — this looks swapped but is preserved as-is; confirm
    # against the actual data layout.
    x_train, train_y = self.read_message('../data/西药执业药师/train.txt')
    x_dev, dev_y = self.read_message('../data/西药执业药师/test.txt')
    x_test, test_y = self.read_message('../data/西药执业药师/dev.txt')
    # Character-level BERT embedding, sequences capped at 100 tokens.
    embedding = BERTEmbedding('bert-base-chinese', sequence_length=100)
    classifier = CNNModel(embedding)
    classifier.fit(x_train, train_y, x_dev, dev_y, epochs=20, batch_size=128,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("../classification-model")
    return classifier.evaluate(x_test, test_y)
def setUpClass(cls):
    """One-time fixture: three training epochs and a CNN model built over the
    shared word2vec embedding."""
    cls.epochs = 3
    cls.model = CNNModel(EmbeddingManager.get_w2v())
def setUpClass(cls):
    """One-time fixture: three training epochs and a default-embedding CNN model."""
    cls.epochs = 3
    # CONSISTENCY FIX: assign via `cls` instead of hard-coding the class name
    # (matches the sibling setUpClass fixtures that already use `cls`).
    cls.model = CNNModel()
def setUpClass(cls):
    """One-time fixture: a single training epoch and a CNN model built over
    the shared BERT embedding."""
    cls.epochs = 1
    embedding = EmbeddingManager.get_bert()
    # CONSISTENCY FIX: assign via `cls` instead of hard-coding the class name
    # (matches the sibling setUpClass fixtures that already use `cls`).
    cls.model = CNNModel(embedding)
import tqdm
import jieba
from kashgari.tasks.classification import CNNModel


def read_data_file(path):
    """Read a cnews TSV file with one '<label>\\t<text>' record per line.

    Returns (x_list, y_list): each x is the jieba-tokenized text, each y the
    corresponding label. Malformed lines (fewer than two fields) are printed
    and skipped.
    """
    # BUG FIX: the original leaked the file handle via open(path, ...).read();
    # a context manager closes it deterministically.
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
    x_list = []
    y_list = []
    for line in tqdm.tqdm(lines):
        rows = line.split('\t')
        if len(rows) >= 2:
            y_list.append(rows[0])
            # Re-join the remainder in case the text itself contains tabs.
            x_list.append(list(jieba.cut('\t'.join(rows[1:]))))
        else:
            print(rows)
    return x_list, y_list


# Load the three cnews splits, train with validation, evaluate on test.
test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

model = CNNModel()
model.fit(train_x, train_y, val_x, val_y, batch_size=128)
result = model.evaluate(test_x, test_y)
model.save('model/kashgari/cnn')
# NOTE(review): this chunk begins mid-method — the three statements below are
# the tail of an enclosing vocabulary-building method whose `def` lies outside
# this view; indentation is reconstructed.
# Map the begin/end-of-sequence markers onto the padding index.
word2idx[k.BOS] = word2idx['pad']
word2idx[k.EOS] = word2idx['pad']
self.token2idx = word2idx

def build_token2idx_dict(self, x_data: List[TextSeqType], min_count: int = 5):
    """No-op: a pre-trained word2vec embedding already fixes the vocabulary,
    so no token2idx mapping is built from the corpus."""
    logging.debug(
        "word2vec embedding no need to build token2idx with corpus")

if __name__ == '__main__':
    # Smoke test: a tiny classification corpus driven through a GPT-2
    # embedding, then a CNN classifier fit for 20 epochs.
    train_x = [
        list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
        list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
        list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
        list('语言学包含了几种分支领域。'),
        list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分'),
    ]
    train_y = ['a', 'a', 'a', 'b', 'c']
    from kashgari.utils.logger import init_logger
    from kashgari.tasks.classification import CNNModel
    init_logger()
    # Local GPT-2 117M checkpoint; sequence length 10.
    embedding = GPT2Embedding(
        '/Users/brikerman/Desktop/python/gpt-2/models/117M', 10)
    r = embedding.embed(['hello', 'world'])
    model = CNNModel(embedding)
    model.fit(train_x, train_y, epochs=20)
    print(r.shape)
def pre_train(self):
    """Load the previously saved classification model and evaluate it on the
    training corpus returned by read_message()."""
    model = CNNModel.load_model("output/classification-model")
    x_items, train_y = read_message()
    # BUG FIX: the original called model.evaluate(self, x_items, train_y),
    # passing `self` (this wrapper object) as the first positional argument;
    # evaluate is a bound method of the loaded model and takes only the data.
    model.evaluate(x_items, train_y)
from kashgari.tasks.classification import CNNLSTMModel, CNNModel

# TensorBoard logging, updated every 10 batches.
tf_board_callback = keras.callbacks.TensorBoard(log_dir='tf_dir',
                                                update_freq=10)

# Persist only the best weights so far, judged by validation accuracy.
save = ModelCheckpoint(
    os.path.join('model_dir', 'CNNModel_bert.h5'),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# Stop training once val_acc has stalled for 8 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=8,
    verbose=1,
    mode='auto',
)

# ------------ build model ------------
# `embed` is prepared earlier in this file, outside this chunk.
model = CNNModel(embed)
model.fit(
    train_features,
    train_labels,
    valid_features,
    valid_labels,
    epochs=60,
    batch_size=256,
    callbacks=[tf_board_callback, save, early_stopping],
)
model.evaluate(test_features, test_labels)
# NOTE(review): this chunk begins mid-function — the lines below are the tail
# of a read_data_file(path) helper (label in column 0, tab-joined text after);
# indentation is reconstructed.
            y_list.append(rows[0])
            x_list.append(list(jieba.cut('\t'.join(rows[1:]))))
        else:
            # Malformed line (fewer than two tab-separated fields): log and skip.
            print(rows)
    return x_list, y_list


# Load the three cnews splits.
test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

# Initialize the word2vec embedding imports; the model below is built without
# an explicit embedding, so kashgari's default is used.
import kashgari
from kashgari.embeddings import WordEmbedding
from kashgari.tasks.classification import CNNModel

model = CNNModel()

# Alternative: BERT embedding (disabled).
# from kashgari.embeddings import BERTEmbedding
# embedding = BERTEmbedding('bert-base-chinese', sequence_length=600)
# model = CNNModel(embedding)

model.fit(train_x, train_y, val_x, val_y, batch_size=128)
model.evaluate(test_x, test_y)
model.save('./model_cnn')