def train(self):
    """Train a CNN text classifier on the car dataset, predict the test
    set, write id,prediction rows to a CSV file, and save the model.

    Reads train/validation splits via self.read_message, trains for 12
    epochs, then segments the test texts with jieba before prediction.
    """
    x_items, train_y, valid_x, valid_y = self.read_message('car/train.csv')
    model = CNNModel()
    # Fit on the training split, validating each epoch; TensorBoard
    # callback records training curves.
    model.fit(x_items, train_y, valid_x, valid_y,
              batch_size=64, epochs=12, callbacks=[tf_board_callback])
    # Test CSV layout (from the indexing below): column 0 is the sample
    # id, columns 1 and 2 are text fields that are concatenated and
    # word-segmented with jieba.
    rows = pd.read_csv("car/test.csv", encoding='utf-8').values.tolist()
    test_data = []
    id_list = []
    for row in rows:
        test_data.append(jieba.lcut(str(row[1]) + str(row[2])))
        id_list.append(row[0])
    predict_answers = model.predict(x_data=test_data)
    # BUGFIX: the original opened this file without ever closing it
    # (handle leak, buffered data could be lost); a context manager
    # guarantees flush + close. str(...) before strip() also tolerates
    # non-string ids read from the CSV.
    with open("data/test_predict_bert_car.csv", 'w', encoding='utf-8') as out:
        for sample_id, answer in zip(id_list, predict_answers):
            out.write(str(sample_id).strip() + "," + str(answer) + "\n")
    model.save("../model/news-classification-bert-model")
def train(self):
    """Train a BERT-embedding CNN classifier and persist it.

    Loads samples via the module-level read_message(), builds a
    BERTEmbedding from self.bert_place (sequence length 256), fits for
    200 epochs, saves the model, then evaluates it on the training data.
    """
    samples, labels = read_message()
    # BERT character-level embedding drives the CNN classifier.
    embedding = BERTEmbedding(self.bert_place, sequence_length=256)
    classifier = CNNModel(embedding)
    classifier.fit(samples, labels, epochs=200, batch_size=32,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("output/classification-model")
    # NOTE(review): evaluation runs on the training data itself, so the
    # reported score is optimistic — no held-out split is visible here.
    classifier.evaluate(samples, labels)
def train():
    """Train a CNN classifier, save it, then print a prediction for
    every training sample.
    """
    samples, labels = read_message()
    # NOTE(review): `bert` is not defined in this function — presumably
    # a module-level embedding; confirm it exists before train() runs.
    classifier = CNNModel(bert)
    classifier.fit(samples, labels, epochs=20, class_weight=True,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("../classification-model")
    # Echo the model's prediction for each sample, one per line.
    for sample in samples:
        prediction = classifier.predict(sample)
        print("\n" + prediction)
def train(self):
    """Train a BERT-based CNN classifier on the pharmacist-exam data
    and return the evaluation result on the held-out split.
    """
    # NOTE(review): 'test.txt' feeds the dev split and 'dev.txt' feeds
    # the test split below — looks swapped; confirm it is intentional.
    x_train, train_y = self.read_message('../data/西药执业药师/train.txt')
    x_dev, dev_y = self.read_message('../data/西药执业药师/test.txt')
    x_test, test_y = self.read_message('../data/西药执业药师/dev.txt')
    # BERT character embedding, sequences truncated/padded to 100.
    embedding = BERTEmbedding('bert-base-chinese', sequence_length=100)
    classifier = CNNModel(embedding)
    classifier.fit(x_train, train_y, x_dev, dev_y, epochs=20, batch_size=128,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("../classification-model")
    return classifier.evaluate(x_test, test_y)
import tqdm
import jieba
from kashgari.tasks.classification import CNNModel


def read_data_file(path):
    """Read a tab-separated 'label<TAB>text' file.

    Returns (x_list, y_list) where each x is the jieba word segmentation
    of the text and each y is the label string. Lines with fewer than
    two tab-separated fields are printed and skipped.
    """
    # BUGFIX: the original used open(path).read() and never closed the
    # handle; the context manager guarantees it is released.
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
    x_list = []
    y_list = []
    for line in tqdm.tqdm(lines):
        rows = line.split('\t')
        if len(rows) >= 2:
            y_list.append(rows[0])
            # Re-join the remainder in case the text itself contains
            # tabs, then segment it into a word list.
            x_list.append(list(jieba.cut('\t'.join(rows[1:]))))
        else:
            print(rows)
    return x_list, y_list


test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

model = CNNModel()
model.fit(train_x, train_y, val_x, val_y, batch_size=128)
result = model.evaluate(test_x, test_y)
model.save('model/kashgari/cnn')
y_list.append(rows[0]) x_list.append(list(jieba.cut('\t'.join(rows[1:])))) else: print(rows) return x_list, y_list test_x, test_y = read_data_file('cnews/cnews.test.txt') train_x, train_y = read_data_file('cnews/cnews.train.txt') val_x, val_y = read_data_file('cnews/cnews.val.txt') # 初始化 word2vec embedding import kashgari # 初始化 word2vec embedding from kashgari.embeddings import WordEmbedding from kashgari.tasks.classification import CNNModel model = CNNModel() # 初始化 BERT embedding # from kashgari.embeddings import BERTEmbedding # # embedding = BERTEmbedding('bert-base-chinese', sequence_length=600) # # # # # 使用 embedding 初始化模型 # # from kashgari.tasks.classification import CNNModel # # model = CNNModel(embedding) model.fit(train_x, train_y, val_x, val_y, batch_size=128) model.evaluate(test_x, test_y) model.save('./model_cnn')