def train(self):
    """Train the CNN classifier on car/train.csv, write predictions for
    car/test.csv to data/test_predict_bert_car.csv, and save the model."""
    x_items, train_y, valid_x, valid_y = self.read_message('car/train.csv')
    model = CNNModel()
    # Fit on training data/labels with a validation split and TensorBoard logging.
    model.fit(x_items, train_y, valid_x, valid_y, batch_size=64, epochs=12,
              callbacks=[tf_board_callback])
    rows = pd.read_csv("car/test.csv", encoding='utf-8').values.tolist()
    test_data = []
    id_list = []
    for row in rows:
        # Column 0 is the record id; columns 1 and 2 are the text fields,
        # concatenated and segmented with jieba.
        test_data.append(jieba.lcut(str(row[1]) + str(row[2])))
        id_list.append(row[0])
    predict_answers = model.predict(x_data=test_data)
    # BUG FIX: the output file was previously opened but never closed
    # (and the handle shadowed the name used for the input rows); a
    # context manager guarantees the file is flushed and released.
    with open("data/test_predict_bert_car.csv", 'w', encoding='utf-8') as out:
        for record_id, answer in zip(id_list, predict_answers):
            record_id = record_id.strip()
            out.write(str(record_id) + "," + str(answer) + "\n")
    model.save("../model/news-classification-bert-model")
def train(self):
    """Train a BERT-embedded CNN text classifier, persist it, and report
    its fit against the training corpus."""
    texts, labels = read_message()
    # Character-level BERT embedding, sequences capped at 256 tokens.
    embedding = BERTEmbedding(self.bert_place, sequence_length=256)
    classifier = CNNModel(embedding)
    # Train with TensorBoard logging threaded through fit_kwargs.
    classifier.fit(texts, labels, epochs=200, batch_size=32,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("output/classification-model")
    # Evaluation is on the training data itself — no held-out set here.
    classifier.evaluate(texts, labels)
def train():
    """Fit a CNN classifier on the corpus, save it, and echo a
    prediction for every training sample."""
    samples, labels = read_message()
    # `bert` embedding is expected to come from module scope.
    classifier = CNNModel(bert)
    classifier.fit(samples, labels, epochs=20, class_weight=True,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("../classification-model")
    # Sanity check: run the saved-in-memory model over each sample.
    for sample in samples:
        result = classifier.predict(sample)
        print("\n" + result)
def train(self):
    """Train a BERT-based CNN classifier on the pharmacist-exam corpus
    and return the evaluation result on the held-out split."""
    x_train, train_y = self.read_message('../data/西药执业药师/train.txt')
    # NOTE(review): the dev split is loaded from test.txt and the test
    # split from dev.txt — confirm this swap is intentional.
    x_dev, dev_y = self.read_message('../data/西药执业药师/test.txt')
    x_test, test_y = self.read_message('../data/西药执业药师/dev.txt')
    # BERT character embedding; sequences fixed at length 100.
    bert = BERTEmbedding('bert-base-chinese', sequence_length=100)
    # Alternative word-level embedding, kept for reference:
    # embedding = WordEmbeddings('sgns.weibo.bigram.bz2', 50)
    long_model = CNNModel(bert)
    # Fit with the train/dev splits; TensorBoard logging via fit_kwargs.
    long_model.fit(x_train, train_y, x_dev, dev_y, epochs=20, batch_size=128,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    long_model.save("../classification-model")
    result = long_model.evaluate(x_test, test_y)
    return result
import tqdm
import jieba
from kashgari.tasks.classification import CNNModel


def read_data_file(path):
    """Parse a tab-separated corpus file into parallel (texts, labels) lists.

    Each line is ``label<TAB>text``; the text is word-segmented with jieba.
    Malformed lines (fewer than two fields) are printed and skipped.
    """
    # BUG FIX: the file handle was previously opened without ever being
    # closed; a context manager guarantees it is released.
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
    x_list = []
    y_list = []
    for line in tqdm.tqdm(lines):
        rows = line.split('\t')
        if len(rows) >= 2:
            y_list.append(rows[0])
            # Re-join with tabs in case the text itself contained tabs.
            x_list.append(list(jieba.cut('\t'.join(rows[1:]))))
        else:
            print(rows)
    return x_list, y_list


test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

model = CNNModel()
model.fit(train_x, train_y, val_x, val_y, batch_size=128)
result = model.evaluate(test_x, test_y)
model.save('model/kashgari/cnn')
        # The pretrained word2vec vocabulary has no BOS/EOS entries;
        # alias both markers to the padding index.
        word2idx[k.BOS] = word2idx['pad']
        word2idx[k.EOS] = word2idx['pad']
        self.token2idx = word2idx

    def build_token2idx_dict(self, x_data: List[TextSeqType], min_count: int = 5):
        """No-op: the word2vec vocabulary is fixed by the pretrained
        vectors, so nothing is built from the corpus."""
        logging.debug(
            "word2vec embedding no need to build token2idx with corpus")


if __name__ == '__main__':
    # Tiny smoke-test corpus for the embedding + classifier pipeline.
    train_x = [
        list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
        list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
        list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
        list('语言学包含了几种分支领域。'),
        list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分'),
    ]
    train_y = ['a', 'a', 'a', 'b', 'c']
    from kashgari.utils.logger import init_logger
    from kashgari.tasks.classification import CNNModel
    init_logger()
    # NOTE(review): a hard-coded local path — this demo only runs on the
    # author's machine; confirm before relying on it.
    embedding = GPT2Embedding(
        '/Users/brikerman/Desktop/python/gpt-2/models/117M', 10)
    r = embedding.embed(['hello', 'world'])
    model = CNNModel(embedding)
    model.fit(train_x, train_y, epochs=20)
    print(r.shape)
# TensorBoard logging every 10 batches into tf_dir/.
tf_board_callback = keras.callbacks.TensorBoard(log_dir='tf_dir', update_freq=10)

from kashgari.tasks.classification import CNNLSTMModel, CNNModel

# Checkpoint the best-so-far weights (by validation accuracy) to disk.
save = ModelCheckpoint(
    os.path.join('model_dir', 'CNNModel_bert.h5'),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='auto'
)
# Stop training once val_acc has not improved for 8 epochs.
early_stopping = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=8,
    verbose=1,
    mode='auto'
)
# `embed` and the feature/label arrays are defined elsewhere in this file.
model = CNNModel(embed)

# ------------ build model ------------
model.fit(
    train_features, train_labels,
    valid_features, valid_labels,
    epochs=60,
    batch_size=256,
    callbacks=[tf_board_callback, save, early_stopping]
)
model.evaluate(test_features, test_labels)