def train_model(self):
    # Hold out 20% of the data for validation.
    x_train, y_train, x_valid, y_valid = self.data_load(validation_split=0.2)
    model = BLSTMCRFModel(self.embedding)
    model.fit(x_train, y_train,
              x_validate=x_valid,
              y_validate=y_valid,
              epochs=self.EPOCHS,
              batch_size=self.BATCH_SIZE)
    model.save(self.model_path)
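# train_model() above assumes a surrounding trainer object. A minimal,
# hypothetical sketch of the attributes and methods it relies on -- every
# name not referenced inside train_model() is an assumption for
# illustration only:
class NERTrainerSketch:
    EPOCHS = 10          # assumed default
    BATCH_SIZE = 64      # assumed default

    def __init__(self, embedding, model_path):
        self.embedding = embedding    # e.g. a kashgari BERTEmbedding
        self.model_path = model_path  # directory handed to model.save()

    def data_load(self, validation_split=0.2):
        # Expected to return x_train, y_train, x_valid, y_valid.
        raise NotImplementedError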
import datetime
import os
import shutil
from configparser import ConfigParser
from math import ceil

from keras.callbacks import (ModelCheckpoint, EarlyStopping, CSVLogger,
                             TensorBoard)
from kashgari.corpus import CoNLL2003Corpus
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.seq_labeling import BLSTMCRFModel
# CyclicLR and SaveMinLoss are assumed to be project-local callbacks; their
# import paths are not shown in the original snippet.


def main():
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_fold = cp["TRAIN"].get("output_fold")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    sequence_length_max = cp["TRAIN"].getint("sequence_length_max")
    output_model_name = cp["TRAIN"].get("output_model_name")
    save_weights_only = cp["TRAIN"].getboolean("save_weights_only")
    cyclicLR_mode = cp["TRAIN"].get("cyclicLR_mode")
    base_lr = cp["TRAIN"].getfloat("base_lr")
    max_lr = cp["TRAIN"].getfloat("max_lr")

    # One experiment directory per day, e.g. experiments/190401/<output_fold>.
    today = datetime.date.today()
    formatted_today = today.strftime('%y%m%d')
    output_dir = os.path.join('experiments', formatted_today, output_fold)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_dir_src = os.path.join(output_dir, 'src')
    if not os.path.isdir(output_dir_src):
        os.makedirs(output_dir_src)

    # Back up the config file and this training script for reproducibility.
    print(f"backup config file to {output_dir_src}")
    shutil.copy(config_file,
                os.path.join(output_dir_src, os.path.split(config_file)[1]))
    train_file = os.path.basename(__file__)
    shutil.copy(train_file, os.path.join(output_dir_src, train_file))

    train_x, train_y = CoNLL2003Corpus.get_sequence_tagging_data('train')
    validate_x, validate_y = CoNLL2003Corpus.get_sequence_tagging_data(
        'validate')
    test_x, test_y = CoNLL2003Corpus.get_sequence_tagging_data('test')

    embedding = BERTEmbedding('bert-large-cased', sequence_length_max)
    # `BLSTMModel` and `CNNLSTMModel` are also available.
    model = BLSTMCRFModel(embedding)
    # model.build_model(train_x, train_y)
    # model.build_multi_gpu_model(gpus=2)
    # print(model.summary())

    if save_weights_only:
        model_weights = os.path.join(output_dir, output_weights_name)
    else:
        model_weights = os.path.join(output_dir, output_model_name)
    checkpoint = ModelCheckpoint(
        model_weights,
        save_weights_only=save_weights_only,
        save_best_only=True,
        verbose=1,
    )
    earlystop = EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=20,
                              verbose=0,
                              mode='min')
    csv_logger = CSVLogger(os.path.join(output_dir, 'training.csv'))

    # CLR step size: number of batches per epoch.
    batch_size_cycliclr = ceil(len(train_x) / batch_size)
    if cyclicLR_mode == 'exp_range':
        gamma = 0.99994
    else:
        gamma = 1.
    clr = CyclicLR(mode=cyclicLR_mode,
                   step_size=batch_size_cycliclr,
                   base_lr=base_lr,
                   max_lr=max_lr,
                   gamma=gamma)
    save_min_loss = SaveMinLoss(filepath=output_dir)
    tb = TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                     batch_size=batch_size)
    callbacks = [
        checkpoint,
        tb,
        csv_logger,
        # clr,
        save_min_loss,
        earlystop,
    ]

    print("** start training **")
    model.fit(train_x, train_y,
              x_validate=validate_x,
              y_validate=validate_y,
              epochs=epochs,
              batch_size=batch_size,
              labels_weight=True,
              fit_kwargs={
                  'callbacks': callbacks,
                  'workers': generator_workers,
                  'use_multiprocessing': True,
                  'class_weight': 'auto',
              })

    model_path = os.path.join(output_dir, 'model')
    model.save(model_path)

    report_evaluate = model.evaluate(test_x, test_y, debug_info=True)
    with open(os.path.join(output_dir, 'report_evaluate.log'), 'w') as f:
        f.write(f"The evaluation report is:\n{report_evaluate}")
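# For reference, a minimal sketch of the config.ini that main() above reads.
# The keys are exactly those accessed via cp["TRAIN"]; all values are
# illustrative assumptions only:
#
#   [TRAIN]
#   output_fold = bert_blstm_crf
#   epochs = 100
#   batch_size = 32
#   generator_workers = 4
#   output_weights_name = weights.h5
#   sequence_length_max = 128
#   output_model_name = model.h5
#   save_weights_only = true
#   cyclicLR_mode = triangular
#   base_lr = 0.0001
#   max_lr = 0.001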
model.__base_hyper_parameters__ = {
    'lstm_layer': {
        'units': 256,
        'return_sequences': True
    },
    'dense_layer': {
        'units': 64,
        'activation': 'tanh'
    }
}
model.fit(train_x, train_y,
          x_validate=validate_x,
          y_validate=validate_y,
          epochs=20,
          batch_size=512,
          labels_weight=True,
          fit_kwargs={'callbacks': [early_stop, log]})
model.evaluate(test_x, test_y)
model.save(model_path)
"""
ep20
                 precision    recall  f1-score   support

fictionalhuman      0.7541    0.7700    0.7620       661
tvshow              0.8809    0.9703    0.9234       404
place               0.8156    0.8581    0.8363      1402
thing               0.7811    0.7332    0.7564      5746
vocabulary          0.8681    0.7880    0.8261     15190
"""
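# `early_stop` and `log` are passed through fit_kwargs above but never
# defined in this snippet; a plausible minimal definition using standard
# Keras callbacks (monitor, patience, and log_dir values are assumptions):
from keras.callbacks import EarlyStopping, TensorBoard

early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='min')
log = TensorBoard(log_dir='./logs')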
from time import time

from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.seq_labeling import BLSTMCRFModel
# get_train_data is assumed to be a project-local helper; its import is not
# shown in the original snippet.


def reduce_text(news):
    # Join title and body with a Chinese full stop, strip newlines and tabs,
    # and split into a character list for the char-level tagger.
    text = news['title'] + '。' + news['content']
    text = text.replace('\n', '').replace('\t', '')
    return list(text)


if __name__ == '__main__':
    start = time()
    print('train start')
    train_x, train_y = get_train_data('data/train_text.txt')
    embedding = BERTEmbedding("bert-base-chinese", sequence_length=512)
    model = BLSTMCRFModel(embedding)
    length = int(len(train_x) * 0.9)  # 90/10 train/validation split
    print(len(train_x[:length]), len(train_y[:length]))
    model.fit(train_x[:length], train_y[:length],
              train_x[length:], train_y[length:],
              epochs=5, batch_size=20)
    # model.fit(train_x[:length], train_y[:length],
    #           train_x[length:], train_y[length:],
    #           epochs=5, batch_size=128,
    #           labels_weight=True, default_labels_weight=100)
    valid_x = train_x[length:]
    valid_y = train_y[length:]
    model.save('models')
    print('train end')

    print('predict start')
    try:
        model = BLSTMCRFModel.load_model('models')
    except Exception:
        print('failed to load model')
    newsId_set = set()
    try:
        with open('data/result_bert.txt', 'r', encoding='utf-8') as file:
            for line in file:
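# Illustration of reduce_text() above with a made-up record: the title and
# content are joined with a Chinese full stop and split into the character
# list that the char-level model consumes:
#
#   reduce_text({'title': '标题', 'content': '正文'})
#   -> ['标', '题', '。', '正', '文']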
import os

# Must be set before TensorFlow/Keras are imported, otherwise the log level
# is ignored.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.seq_labeling import BLSTMCRFModel
from util import InputHelper

train_x, train_y = InputHelper().read_corpus('data', 'Bert_train')
embedding = BERTEmbedding('./chinese_L-12_H-768_A-12', sequence_length=256)
model = BLSTMCRFModel(embedding)
model.fit(train_x, train_y, epochs=10, batch_size=512)
model.save('./model')
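# A hedged sketch of reusing the model saved above for tagging. load_model()
# and predict() follow the kashgari 0.x seq_labeling API also used in the
# snippets above; the sample sentence is an assumption:
loaded = BLSTMCRFModel.load_model('./model')
tokens = list('今天天气不错')   # char-level tokens, matching the training data
print(loaded.predict(tokens))   # one predicted label per input character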