import json
import numpy as np
import codecs

from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizer import SpTokenizer
from bert4keras.bert import build_bert_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, get_all_attributes

# expose keras.layers (Dense, Lambda, ...) as local names
locals().update(get_all_attributes(keras.layers))

set_gelu('tanh')

maxlen = 256
config_path = 'models/albert_base/albert_config.json'
checkpoint_path = 'models/albert_base/variables/variables'
spm_path = 'models/albert_base/assets/30k-clean.model'


def load_data(filename):
    """Load tab-separated (text, label) pairs."""
    D = []
    with codecs.open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D


train_data = load_data('datasets/IMDB_trainshuffle.data')
valid_data = load_data('datasets/IMDB_valshuffle.data')
test_data = load_data('datasets/IMDB_testshuffle.data')
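# --- Hedged sketch (not from the original script): one way the objects above
# are typically wired together in bert4keras examples of this vintage. The
# `model='albert'` keyword and the two-class softmax head are assumptions.
tokenizer = SpTokenizer(spm_path)  # SentencePiece tokenizer for ALBERT

albert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='albert',
)

# Lambda/Dense are available via the locals().update(...) call above.
output = Lambda(lambda x: x[:, 0])(albert.output)      # take the [CLS] vector
output = Dense(units=2, activation='softmax')(output)  # binary sentiment head
model = keras.models.Model(albert.input, output)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(2e-5),
              metrics=['accuracy'])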
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time    : 2020/4/29 23:43
# @author  : Mo
# @function: text-classification (tc)


from macadam.tc.t00_predict import ModelPredict
from macadam.tc.t00_trainer import trainer
from macadam.tc.t00_map import graph_map
from bert4keras.backend import set_gelu

set_gelu("tanh")  # "erf" or "tanh"
# val_acc: 0.887071, test_acc: 0.870320

import json
import numpy as np
from random import choice
import re, os, codecs

from bert4keras.backend import set_gelu, K
from bert4keras.utils import Tokenizer, load_vocab
from bert4keras.bert import build_bert_model
from bert4keras.train import PiecewiseLinearLearningRate
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import Callback

set_gelu('tanh')  # switch gelu version

maxlen = 128
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'


def load_data(filename):
    """Load tab-separated (text1, text2, label) triples."""
    D = []
    with codecs.open(filename, encoding='utf-8') as f:
        for l in f:
            text1, text2, label = l.strip().split('\t')
            D.append((text1, text2, int(label)))
    return D
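# --- Hedged sketch (not from the original script): encoding one of the
# (text1, text2, label) triples produced by load_data. The exact
# Tokenizer.encode keyword arguments vary across early bert4keras releases,
# so only positional arguments are used here.
token_dict = load_vocab(dict_path)
tokenizer = Tokenizer(token_dict)

text1, text2, label = '今天天气不错', '今天天气很好', 1  # placeholder triple
token_ids, segment_ids = tokenizer.encode(text1, text2)
# token_ids packs both sentences as [CLS] text1 [SEP] text2 [SEP];
# segment_ids is 0 over the first sentence and 1 over the second.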
def train(args):
    if "bert" in args.model_type:
        set_gelu("tanh")  # switch gelu version

        # Step 1: Load Data
        data_generator = None
        if "siamese" in args.model_type:
            data_generator = SiameseDataGenerator
        elif "albert" in args.model_type:
            data_generator = BertDataGenerator
        train_ds = data_generator(data_path=args.train_data_path,
                                  batch_size=args.batch_size,
                                  dict_path=args.bert_dict_path,
                                  maxlen=args.query_len)
        dev_ds = data_generator(data_path=args.dev_data_path,
                                batch_size=args.batch_size,
                                maxlen=args.query_len,
                                dict_path=args.bert_dict_path)
        test_ds = data_generator(data_path=args.test_data_path,
                                 batch_size=args.batch_size,
                                 maxlen=args.query_len,
                                 dict_path=args.bert_dict_path)

        # Step 2: Load Model
        model = None
        if "siamese" in args.model_type:
            model = SiameseBertModel(config_path=args.bert_config_path,
                                     checkpoint_path=args.bert_checkpoint_path,
                                     dense_units=args.dense_units)
        elif "albert" in args.model_type:
            model = BertModel(config_path=args.bert_config_path,
                              checkpoint_path=args.bert_checkpoint_path)
        model_name = model.__class__.__name__
        model = model.get_model()

        from bert4keras.optimizers import Adam
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(2e-5),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(5e-5), {10000: 1, 30000: 0.1}),
            metrics=['accuracy'],
        )

        evaluator = Evaluator(dev_ds=dev_ds, model_name=model_name,
                              is_bert_model=True, test_ds=test_ds)
        logger.info("***** Running training *****")
        logger.info("  Model Class Name = %s", model_name)
        logger.info("  Num Epochs = %d", args.epoch)
        model.fit_generator(train_ds.forfit(),
                            steps_per_epoch=len(train_ds),
                            epochs=args.epoch,
                            callbacks=[evaluator],
                            verbose=2)

        model.load_weights('./checkpoints/best_{}.weight'.format(model_name))
        logger.info("***** Test Result *****")
        logger.info("  Model = %s", model_name)
        logger.info("  Batch Size = %d", args.batch_size)
        logger.info("  Final Test Acc: %05f",
                    cal_acc(data=test_ds, model=model, is_bert_model=True))

    elif "NN" in args.model_type:
        # Step 1: Load Data
        train_data = pd.read_csv(args.train_data_path)
        dev_data = pd.read_csv(args.dev_data_path)
        test_data = pd.read_csv(args.test_data_path)
        category_count = len(train_data["category"].value_counts())
        category_encoder = category_OneHotEncoder(data_df=train_data)
        loader = LoadData(w2v_path=args.w2v_path, query_len=args.query_len)
        word2idx = loader.word2idx
        emd_matrix = loader.emb_matrix
        """
        Note: the order of shuffle matters. The usual advice is to shuffle
        first and batch afterwards, so that each batch of batch_size elements
        is drawn from the fully shuffled data. If you batch first and then
        shuffle, only the order of the batches is randomized while the data
        inside each batch stays in its original order, which weakens the
        randomization. (See the standalone sketch after this function.)
        """
        train_ds = loader.dataset(encoder=category_encoder, data_df=train_data)
        train_ds = train_ds.shuffle(buffer_size=len(train_data)).batch(
            batch_size=args.batch_size).repeat()
        dev_ds = loader.dataset(encoder=category_encoder, data_df=dev_data)
        dev_ds = dev_ds.batch(batch_size=args.batch_size)
        test_ds = loader.dataset(encoder=category_encoder, data_df=test_data)
        test_ds = test_ds.batch(batch_size=args.batch_size)

        # Step 2: Load Model
        model = None
        if "siamese_CNN" in args.model_type:
            model = SiameseCnnModel(emb_matrix=emd_matrix,
                                    word2idx=word2idx,
                                    filters_nums=args.filters_nums,
                                    kernel_sizes=args.kernel_sizes,
                                    dense_units=args.dense_units,
                                    label_count=args.label_count,
                                    category_count=category_count,
                                    query_len=args.query_len,
                                    shared=args.feature_shared,
                                    add_feature=args.add_features)
        elif "siamese_RNN" in args.model_type:
            model = SiameseRnnModel(emb_matrix=emd_matrix,
                                    word2idx=word2idx,
                                    hidden_units=args.hidden_units,
                                    dense_units=args.dense_units,
                                    label_count=args.label_count,
                                    category_count=category_count,
                                    query_len=args.query_len,
                                    mask_zero=args.mask_zero,
                                    bidirection=args.bi_direction,
                                    shared=args.feature_shared,
                                    add_feature=args.add_features)
        model_name = model.__class__.__name__
        model = model.get_model()

        logger.info("***** Running training *****")
        logger.info("  Model Class Name = %s", model_name)
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Num Epochs = %d", args.epoch)
        model.compile(optimizer='adam',
                      loss="binary_crossentropy",
                      metrics=["acc"])
        early_stopping = EarlyStopping(monitor="val_acc", patience=3, mode="max")
        evaluator = Evaluator(dev_ds=dev_ds, model_name=model_name,
                              is_bert_model=False, dev_label=dev_data['label'])

        # Step 3: Train Model
        history = model.fit(train_ds,
                            callbacks=[early_stopping, evaluator],
                            epochs=args.epoch,
                            steps_per_epoch=len(train_data) // args.batch_size,
                            validation_data=dev_ds,
                            validation_steps=len(dev_data) // args.batch_size)

        # Step 4: Save model and training logs
        logger.info("***** Training Logs *****")
        for epoch in history.epoch:
            logger.info("Epoch %d", epoch)
            logger.info("train_loss:%f train_acc:%f val_loss:%f val_acc:%f",
                        history.history.get("loss")[epoch],
                        history.history.get("acc")[epoch],
                        history.history.get("val_loss")[epoch],
                        history.history.get("val_acc")[epoch])

        # time_stamp = datetime.datetime.now().strftime('%m-%d_%H-%M-%S')
        # path = './checkpoints/{}_{}.h5'.format(model_name, time_stamp)
        # model.save(path)
        model = load_model('./checkpoints/best_{}.h5'.format(model_name))
        y_pred = model.predict(test_ds)
        y_true = test_data["label"].values.reshape((-1, 1))
        y_pred = (y_pred > 0.5).astype("int32")  # threshold probabilities at 0.5
        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        logger.info("***** Parameters *****")
        logger.info("  ModelName = %s", args.model_type)
        logger.info("  Add Features = %s", args.add_features)
        logger.info("  Embedding dims = %d", len(emd_matrix[0]))
        logger.info("  BatchSize = %d", args.batch_size)
        if "CNN" in args.model_type:
            logger.info("  kernel_sizes = %s", args.kernel_sizes)
            logger.info("  filters_nums = %s", args.filters_nums)
        elif "RNN" in args.model_type:
            logger.info("  hidden_units = %s", args.hidden_units)
            logger.info("  bi_direction = %s", args.bi_direction)
        logger.info("  dense_units = %s", args.dense_units)
        logger.info("  feature_shared = %s", args.feature_shared)
        logger.info("***** Testing Results *****")
        logger.info("  Acc = %f", acc)
        logger.info("  Precision = %f", precision)
        logger.info("  Recall = %f", recall)
        logger.info("  F1-score = %f", f1)
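# --- Standalone sketch of the shuffle-before-batch point made in train()
# above (plain tf.data, no project code; the toy dataset is an assumption):
import tensorflow as tf

ds = tf.data.Dataset.range(8)

# shuffle -> batch: elements are shuffled globally, then grouped.
good = ds.shuffle(buffer_size=8).batch(4)

# batch -> shuffle: only the order of whole batches changes; each batch
# still holds consecutive elements, so the randomization is much weaker.
weak = ds.batch(4).shuffle(buffer_size=2)

for batch in good.take(1):
    print(batch.numpy())  # e.g. [5 0 7 2]
for batch in weak.take(1):
    print(batch.numpy())  # e.g. [4 5 6 7]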
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from bert4keras.backend import set_gelu
from bert4keras.bert import build_bert_model
from keras.layers import *
from keras.models import Model
from keras_contrib.layers import CRF

set_gelu("tanh")


class NER_Model(object):
    def __init__(self, model_configs):
        self.bert_config = model_configs.get("bert_config")
        self.bert_checkpoint = model_configs.get("bert_checkpoint")
        self.albert = model_configs.get("albert")
        self.model_type = model_configs.get("model_type")
        self.cell_type = model_configs.get("cell_type")
        self.rnn_units = model_configs.get("rnn_units")
        self.rnn_layers = model_configs.get("rnn_layers")
        self.cnn_filters = model_configs.get("cnn_filters")
        self.cnn_kernel_size = model_configs.get("cnn_kernel_size")
        self.cnn_blocks = model_configs.get("cnn_blocks")
        self.crf_only = model_configs.get("crf_only")
        self.dropout_rate = model_configs.get("dropout_rate")
        self.max_len = model_configs.get("max_len")
        self.numb_tags = model_configs.get("numb_tags")
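# --- Hypothetical usage sketch: every key below mirrors a model_configs.get()
# call in __init__ above; the concrete values are assumptions for illustration.
model_configs = {
    "bert_config": "/path/to/bert_config.json",
    "bert_checkpoint": "/path/to/bert_model.ckpt",
    "albert": False,
    "model_type": "bilstm_crf",
    "cell_type": "lstm",
    "rnn_units": 128,
    "rnn_layers": 1,
    "cnn_filters": 128,
    "cnn_kernel_size": 3,
    "cnn_blocks": 4,
    "crf_only": False,
    "dropout_rate": 0.1,
    "max_len": 128,
    "numb_tags": 7,  # e.g. BIO tags for 3 entity types plus "O"
}
ner = NER_Model(model_configs)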
#! -*- coding:utf-8 -*-
# Evaluation script
# Dataset: IFLYTEK long-text classification (https://github.com/CLUEbenchmark/CLUE)

import json
from io import open
import numpy as np

from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizer import Tokenizer
from bert4keras.bert import build_bert_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from keras.layers import *

set_gelu('tanh')  # switch gelu version

num_classes = 119
maxlen = 128
batch_size = 32

# RoBERTa small
config_path = '/root/kg/bert/chinese_roberta_L-6_H-384_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_roberta_L-6_H-384_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_roberta_L-6_H-384_A-12/vocab.txt'
model_type = 'bert'

"""
# albert small
config_path = '/root/kg/bert/albert_small_zh_google/albert_config.json'
checkpoint_path = '/root/kg/bert/albert_small_zh_google/albert_model.ckpt'
"""
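# --- Hedged sketch (not from the original script): the data_generator
# pattern these bert4keras evaluation scripts normally pair with the setup
# above. The (text, label) sample layout and the `max_length` keyword are
# assumptions for this bert4keras version.
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class data_generator(DataGenerator):
    def __iter__(self, random=False):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for i in idxs:
            text, label = self.data[i]
            token_ids, segment_ids = tokenizer.encode(text, max_length=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                # pad every sequence in the batch to a common length
                yield [sequence_padding(batch_token_ids),
                       sequence_padding(batch_segment_ids)], \
                      sequence_padding(batch_labels)
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []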
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

#################################################################################################
# command-line parameters
parser = argparse.ArgumentParser(description='classifier albert')
parser.add_argument('--model', type=str, default='large',
                    help='pre-trained model: large or xxlarge')
parser.add_argument('--do_train', type=int, default=0, help='do train')
parser.add_argument('--do_predict', type=int, default=0, help='do predict')
parser.add_argument('--bert_path', type=str, default='../ALbert/albert_xxlarge/',
                    help='bert_path')
parser.add_argument('--file_pre', type=str, default='a', help='data file name')
parser.add_argument('--maxlen', type=int, default=128, help='maxlen')
parser.add_argument('--batch_size', type=int, default=32, help='batch_size')
args = parser.parse_args()

set_gelu('tanh')  # gelu

do_train = args.do_train
do_predict = args.do_predict
logging.info('do_train:%d, do_predict:%d' % (do_train, do_predict))

file_pre = args.file_pre
# number of classes per task
dic_nums = {'a': 2, 'b': 2, 'c': 3}
num_classes = dic_nums[file_pre]
maxlen = args.maxlen
batch_size = args.batch_size
logging.info('Running Params: File: Training_%s, num_classes:%d, maxlen: %d, batch_size: %d'
             % (file_pre, num_classes, maxlen, batch_size))

# pre-trained model
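# --- Hypothetical invocation (the script filename is an assumption):
#   python classifier.py --model large --do_train 1 --file_pre b --maxlen 128 --batch_size 32
# With --file_pre b, num_classes resolves to dic_nums['b'] == 2.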
import numpy as np
import pandas as pd
import tensorflow as tf

from bert4keras.backend import keras, set_gelu, K
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Dropout, Dense

tf.config.list_physical_devices('GPU')  # list available GPUs

set_gelu('tanh')  # switch the version of the gelu activation

maxlen = 128
batch_size = 32
config_path = './chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './chinese_L-12_H-768_A-12/vocab.txt'

# Load datasets
train_df = pd.read_csv('./data/train.csv')
valid_df = pd.read_csv('./data/dev.csv')
test_df = pd.read_csv('./data/test.csv')
train_df.dropna(axis=0, inplace=True)

train_data = train_df[['query1', 'query2', 'label']].values
valid_data = valid_df[['query1', 'query2', 'label']].values
test_data = test_df[['query1', 'query2', 'label']].values
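# --- Hedged sketch (not from the original script): the standard bert4keras
# data_generator pattern for the (query1, query2, label) rows loaded above.
# The `maxlen` keyword on encode() assumes a recent bert4keras release.
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class data_generator(DataGenerator):
    """Yields ([token_ids, segment_ids], labels) batches for sentence pairs."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (query1, query2, label) in self.sample(random):
            # encode the pair as [CLS] query1 [SEP] query2 [SEP]
            token_ids, segment_ids = tokenizer.encode(query1, query2, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([int(label)])
            if len(batch_token_ids) == self.batch_size or is_end:
                yield [sequence_padding(batch_token_ids),
                       sequence_padding(batch_segment_ids)], \
                      sequence_padding(batch_labels)
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []

train_generator = data_generator(train_data, batch_size)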