def train(self):
        """Train a CNN classifier and export predictions for the car test set.

        Steps:
          1. Load four values from ``self.read_message('car/train.csv')`` and
             pass them positionally to ``CNNModel.fit`` (presumably training
             data, training labels, validation data, validation labels --
             confirm against ``read_message``).
          2. Tokenise columns 1 and 2 of every row of ``car/test.csv`` with
             jieba and predict a label for each row.
          3. Write one ``id,label`` line per row to
             ``data/test_predict_bert_car.csv``.
          4. Save the model to ``../model/news-classification-bert-model``.
        """
        x_items, train_y, valid_x, valid_y = self.read_message('car/train.csv')

        model = CNNModel()
        # Feed the model its training data, labels and epoch count.
        model.fit(x_items,
                  train_y,
                  valid_x,
                  valid_y,
                  batch_size=64,
                  epochs=12,
                  callbacks=[tf_board_callback])

        # Build the prediction input: column 0 is kept as the row id, columns
        # 1 and 2 are concatenated and word-segmented.
        rows = pd.read_csv("car/test.csv", encoding='utf-8').values.tolist()
        test_data = []
        id_list = []
        for row in rows:
            test_data.append(jieba.lcut(str(row[1]) + str(row[2])))
            id_list.append(row[0])
        predict_answers = model.predict(x_data=test_data)

        # The context manager guarantees the file is flushed and closed even
        # if a write fails; the previous bare open() never closed the handle.
        with open("data/test_predict_bert_car.csv", 'w', encoding='utf-8') as out:
            for sample_id, label in zip(id_list, predict_answers):
                # Convert before stripping so ids that pandas parsed as
                # numbers do not raise AttributeError on .strip().
                out.write(str(sample_id).strip() + "," + str(label) + "\n")

        # Save the model.
        model.save("../model/news-classification-bert-model")
Example #2
0
def test_dataset(model_dir: str) -> list:
	"""Classify unpredicted rows of `webpage_text` and rebuild `hot_key`.

	Rows whose ``<model>_predict`` column is NULL are fetched, the page text
	and title are joined, word-segmented and stripped of stop words, and the
	model's prediction is written back to that column. Afterwards the
	`hot_key` table is emptied and refilled from the 100 most common tokens
	seen in this run; single-character tokens among those 100 are skipped,
	so fewer than 100 rows may be inserted.

	Args:
		model_dir: Path of the saved model. It must contain ``cnn.model``,
			``cnnlstm.model`` or ``blstm.model``; the file-name part before
			``.model`` is used as the prefix of the prediction column.

	Returns:
		Nothing is returned, despite the ``list`` annotation, which is kept
		unchanged for interface compatibility.

	Raises:
		ValueError: If ``model_dir`` names no supported model type, or the
			derived column prefix contains a character that cannot be
			embedded safely in the SQL text.
	"""
	# e.g. "models/cnn.model" -> "cnn"
	model_name = model_dir.split('.model')[0].split('/')[-1]
	# Identifiers cannot be sent as query parameters, so the column name is
	# still spliced into the SQL text; refuse the characters that would break
	# the backtick quoting or the driver's %-style parameter substitution.
	if '`' in model_name or '%' in model_name:
		raise ValueError('Unsafe model name for a column prefix: %r' % model_name)

	# Decide which model class performs the prediction.
	if 'cnn.model' in model_dir:
		model_cls = CNNModel
	elif 'cnnlstm.model' in model_dir:
		model_cls = CNNLSTMModel
	elif 'blstm.model' in model_dir:
		model_cls = BLSTMModel
	else:
		# Previously this fell through and failed later on an unbound `model`.
		raise ValueError('Unsupported model file: %r' % model_dir)

	predict_column = '`' + model_name + '_predict`'
	select_sql = (
		'SELECT `page_text`,`page_title`,`category`,`hash` FROM `webpage_text` '
		'WHERE ' + predict_column + ' IS NULL ORDER BY `time` desc'
	)
	# Values travel as parameters so quotes inside a prediction or hash can
	# neither break the statement nor inject SQL.
	update_sql = 'UPDATE `webpage_text` SET ' + predict_column + '=%s WHERE hash=%s'

	conn = pymysql.connect(host=DB_HOST,
	                       port=int(DB_PORT),
	                       user=DB_USER,
	                       password=DB_PASS,
	                       db=DB_NAME,
	                       charset=DB_CHARSET
	                       )
	try:
		cursor = conn.cursor()
		cursor.execute(select_sql)
		data = cursor.fetchall()
		model = model_cls.load_model(model_dir)

		all_text = []
		for page_text, page_title, _label, page_hash in tqdm.tqdm(data):
			# Segment the article, joining body text and title.
			content = strip_stopwords(list(jieba.cut(page_text + '。' + page_title)))
			all_text += content
			predict = model.predict(content)
			# str() mirrors the text that the old string formatting stored.
			cursor.execute(update_sql, (str(predict), page_hash))
			# Commit per row so finished predictions survive a later failure.
			conn.commit()

		# Count token frequencies and store the hot words.
		cursor.execute(
			'DELETE FROM `hot_key` WHERE 1=1'
			)
		conn.commit()
		rank = 1
		for word, count in Counter(all_text).most_common(100):
			if len(word) == 1:
				continue
			cursor.execute(
				'INSERT INTO `hot_key` VALUES (%s, %s, %s)', (rank, word, count)
				)
			conn.commit()
			rank += 1
	finally:
		# Release the connection on success and on error alike.
		conn.close()
	print('[+] Success')
 def train(self):
     """Fit a CNN classifier on a BERT embedding, save it, then evaluate it.

     The embedding is loaded from ``self.bert_place`` with a sequence length
     of 256. Evaluation runs on the same data that was used for fitting.
     """
     features, labels = read_message()
     # Build the BERT embedding.
     embedding = BERTEmbedding(self.bert_place, sequence_length=256)
     classifier = CNNModel(embedding)
     # Training data, labels, epochs and batch size go to the model here.
     extra_fit_options = {'callbacks': [tf_board_callback]}
     classifier.fit(
         features,
         labels,
         epochs=200,
         batch_size=32,
         fit_kwargs=extra_fit_options,
     )
     # Save the model.
     classifier.save("output/classification-model")
     classifier.evaluate(features, labels)
def train():
    """Fit a CNN classifier, save it, and print a prediction per sample.

    ``bert`` is not defined in this function; it is expected to exist in the
    enclosing scope.
    """
    samples, labels = read_message()
    classifier = CNNModel(bert)
    # Training data, labels and epoch count go to the model here.
    classifier.fit(
        samples,
        labels,
        epochs=20,
        class_weight=True,
        fit_kwargs={'callbacks': [tf_board_callback]},
    )
    # Save the model.
    classifier.save("../classification-model")
    for sample in samples:
        print("\n" + classifier.predict(sample))
    def train(self):
        """Train a CNN classifier on BERT features and evaluate it.

        The model is fitted on ``train.txt`` with ``dev.txt`` supplied as the
        second data/label pair to ``fit``, saved to
        ``../classification-model``, and finally evaluated on ``test.txt``.

        Returns:
            Whatever ``CNNModel.evaluate`` returns for the test split.
        """
        x_train, train_y = self.read_message('../data/西药执业药师/train.txt')
        # The dev and test files used to be swapped (dev variables were read
        # from test.txt and vice versa), so the model was tuned against the
        # test file and the reported score came from the dev file.
        x_dev, dev_y = self.read_message('../data/西药执业药师/dev.txt')
        x_test, test_y = self.read_message('../data/西药执业药师/test.txt')
        # Build the BERT embedding.
        bert = BERTEmbedding('bert-base-chinese', sequence_length=100)
        # Word-vector alternative:
        # embedding = WordEmbeddings('sgns.weibo.bigram.bz2', 50)

        long_model = CNNModel(bert)
        # Training data, labels and epoch count go to the model here.
        long_model.fit(x_train,
                       train_y,
                       x_dev,
                       dev_y,
                       epochs=20,
                       batch_size=128,
                       fit_kwargs={'callbacks': [tf_board_callback]})
        # Save the model.
        long_model.save("../classification-model")
        result = long_model.evaluate(x_test, test_y)
        return result
 def setUpClass(cls):
     """Set the epoch count to 3 and build a CNN model on the w2v embedding."""
     cls.epochs = 3
     cls.model = CNNModel(EmbeddingManager.get_w2v())
 def setUpClass(cls):
     """Set the epoch count to 3 and attach a default CNN model.

     The model is stored on ``TestCNNModel`` by name rather than on ``cls``.
     """
     cls.epochs = 3
     classifier = CNNModel()
     TestCNNModel.model = classifier
 def setUpClass(cls):
     """Set the epoch count to 1 and attach a CNN model on the BERT embedding.

     The model is stored on ``TestCNNModelWithBERT`` by name rather than on
     ``cls``.
     """
     cls.epochs = 1
     TestCNNModelWithBERT.model = CNNModel(EmbeddingManager.get_bert())
Example #9
0
import tqdm
import jieba
from kashgari.tasks.classification import CNNModel


def read_data_file(path):
    """Read a tab-separated ``label<TAB>text`` file into tokens and labels.

    Each line is split on tabs. The first field is the label; all remaining
    fields are re-joined with tabs and word-segmented with jieba. Lines with
    fewer than two fields are printed and skipped.

    Args:
        path: Path of a UTF-8 encoded data file.

    Returns:
        A ``(x_list, y_list)`` tuple: the token list of every kept line and
        the matching labels, in file order.
    """
    # Read inside a context manager so the handle is closed immediately;
    # the previous bare open(...).read() left it to the garbage collector.
    with open(path, 'r', encoding='utf-8') as data_file:
        lines = data_file.read().splitlines()
    x_list = []
    y_list = []
    for line in tqdm.tqdm(lines):
        rows = line.split('\t')
        if len(rows) >= 2:
            y_list.append(rows[0])
            x_list.append(list(jieba.cut('\t'.join(rows[1:]))))
        else:
            # Surface malformed lines instead of dropping them silently.
            print(rows)
    return x_list, y_list


# Load the three cnews splits in the order test, train, validation.
(test_x, test_y), (train_x, train_y), (val_x, val_y) = map(
    read_data_file,
    ('cnews/cnews.test.txt', 'cnews/cnews.train.txt', 'cnews/cnews.val.txt'),
)

# Fit a default CNN classifier, score it on the test split, then save it.
model = CNNModel()
model.fit(
    train_x,
    train_y,
    val_x,
    val_y,
    batch_size=128,
)
result = model.evaluate(test_x, test_y)
model.save('model/kashgari/cnn')
Example #10
0
        word2idx[k.BOS] = word2idx['pad']
        word2idx[k.EOS] = word2idx['pad']
        self.token2idx = word2idx

    def build_token2idx_dict(self,
                             x_data: List[TextSeqType],
                             min_count: int = 5):
        """Log a debug message and do nothing else.

        Neither ``x_data`` nor ``min_count`` is used.
        """
        message = "word2vec embedding no need to build token2idx with corpus"
        logging.debug(message)


if __name__ == '__main__':
    # Manual smoke test: embed two tokens with GPT-2, then fit a tiny CNN
    # classifier on five character-level sentences.
    from kashgari.utils.logger import init_logger
    from kashgari.tasks.classification import CNNModel

    sentences = (
        '语言学(英语:linguistics)是一门关于人类语言的科学研究',
        '语言学(英语:linguistics)是一门关于人类语言的科学研究',
        '语言学(英语:linguistics)是一门关于人类语言的科学研究',
        '语言学包含了几种分支领域。',
        '在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分',
    )
    # Each sentence becomes a list of single characters.
    train_x = [list(sentence) for sentence in sentences]
    train_y = ['a', 'a', 'a', 'b', 'c']

    init_logger()
    # NOTE(review): the model path is specific to the original author's machine.
    embedding = GPT2Embedding(
        '/Users/brikerman/Desktop/python/gpt-2/models/117M',
        10,
    )
    r = embedding.embed(['hello', 'world'])
    model = CNNModel(embedding)
    model.fit(train_x, train_y, epochs=20)
    print(r.shape)
 def pre_train(self):
     """Load the saved classification model and evaluate it.

     The model is read from ``output/classification-model`` and evaluated on
     the data returned by ``read_message()``.
     """
     model = CNNModel.load_model("output/classification-model")
     x_items, train_y = read_message()
     # `evaluate` is called on the model instance, so this object's `self`
     # must not be passed; it used to shift the data into the label slot.
     model.evaluate(x_items, train_y)
Example #12
0
# TensorBoard logging into ./tf_dir.
tf_board_callback = keras.callbacks.TensorBoard(
    log_dir='tf_dir',
    update_freq=10,
)

from kashgari.tasks.classification import CNNLSTMModel, CNNModel

# Checkpoint callback: writes model_dir/CNNModel_bert.h5, monitoring `val_acc`
# with save_best_only enabled.
save = ModelCheckpoint(
    os.path.join('model_dir', 'CNNModel_bert.h5'),
    save_best_only=True,
    monitor='val_acc',
    mode='auto',
    verbose=1,
)
# Early-stopping callback: monitors `val_acc` with a patience of 8.
early_stopping = EarlyStopping(
    patience=8,
    min_delta=0,
    monitor='val_acc',
    mode='auto',
    verbose=1,
)
model = CNNModel(embed)

# ------------ train and evaluate ------------
model.fit(
    train_features,
    train_labels,
    valid_features,
    valid_labels,
    epochs=60,
    batch_size=256,
    callbacks=[tf_board_callback, save, early_stopping],
)
model.evaluate(test_features, test_labels)
            y_list.append(rows[0])
            x_list.append(list(jieba.cut('\t'.join(rows[1:]))))
        else:
            print(rows)
    return x_list, y_list


# Load the three cnews splits in the order test, train, validation.
(test_x, test_y), (train_x, train_y), (val_x, val_y) = map(
    read_data_file,
    ('cnews/cnews.test.txt', 'cnews/cnews.train.txt', 'cnews/cnews.val.txt'),
)

import kashgari
from kashgari.embeddings import WordEmbedding

from kashgari.tasks.classification import CNNModel

# The classifier is built without an explicit embedding argument.
model = CNNModel()
# Alternative kept for reference: initialise the model from a BERT embedding.
# from kashgari.embeddings import BERTEmbedding
# embedding = BERTEmbedding('bert-base-chinese', sequence_length=600)
# from kashgari.tasks.classification import CNNModel
# model = CNNModel(embedding)

model.fit(
    train_x,
    train_y,
    val_x,
    val_y,
    batch_size=128,
)
model.evaluate(test_x, test_y)
model.save('./model_cnn')