def train_ner():
    """Train the BERT-BiLSTM NER model using files located under ``prefix``.

    Takes the defaults from ``get_args_parser()``, overrides the label,
    checkpoint, data, config and vocab locations plus a handful of flags,
    prints the resulting settings, exports ``CUDA_VISIBLE_DEVICES`` from
    ``args.device_map`` and finally calls ``train``.

    ``prefix`` and ``tf`` are not defined here; they must be available in
    the enclosing module scope.
    """
    import os
    import sys

    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_lstm_ner import train

    args = get_args_parser()

    # Values forced on top of whatever the argument parser produced.
    # (save_checkpoints_steps / save_summary_steps are left at their
    # parser defaults.)
    overrides = {
        'label_list': prefix + 'data_dir/labels.txt',
        'init_checkpoint': prefix + 'init_checkpoint/bert_model.ckpt',
        'data_dir': prefix + 'data_dir/',
        'output_dir': 'out_dir/',
        'bert_config_file': prefix + 'init_checkpoint/bert_config.json',
        'vocab_file': prefix + 'init_checkpoint/vocab.txt',
        'verbose': True,
        'gpu_memory_fraction': 1.0,
        'do_predict': False,
        'clean': True,
    }
    for option, value in overrides.items():
        setattr(args, option, value)

    # Dump the command line followed by every setting, sorted by name.
    settings = sorted(vars(args).items())
    param_str = '\n'.join('%20s = %s' % (key, value) for key, value in settings)
    banner = 'usage: %s\n%20s %s\n%s\n%s\n' % (
        ' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str)
    print(banner)
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    tf.logging.set_verbosity(tf.logging.INFO)
    train(args=args)
def train_ner():
    """Parse the training arguments, print them and start NER training.

    ``os``, ``get_args_parser`` and ``train`` are taken from the enclosing
    module scope.
    """
    import sys

    args = get_args_parser()

    # One "name = value" row per setting, sorted by setting name.
    rows = []
    for key, value in sorted(vars(args).items()):
        rows.append('%20s = %s' % (key, value))
    param_str = '\n'.join(rows)

    command_line = ' '.join(sys.argv)
    print('usage: %s\n%20s %s\n%s\n%s\n' % (command_line, 'ARG', 'VALUE', '_' * 50, param_str))
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)
def train_bert_class():
    """Parse the training arguments, print them and train the BERT classifier.

    The selected GPUs are exported through ``CUDA_VISIBLE_DEVICES`` (taken
    from ``args.device_map``) before ``train`` is called.
    """
    import os
    import sys

    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_classifier import train

    args = get_args_parser()

    # Show the command line and every setting, sorted by name.
    settings = sorted(vars(args).items())
    param_str = '\n'.join('%20s = %s' % pair for pair in settings)
    header = ('usage: %s\n%20s %s\n%s\n%s\n'
              % (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str))
    print(header)
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args)
def train_ner():
    """Parse the training arguments, print them and start NER training.

    Uses the ``bert_lstm_ner_train_inpend_eval`` trainer, in which training
    and evaluation can be run separately. The alternative trainer,
    ``bert_base.train.bert_lstm_ner.train``, runs training and evaluation
    together.
    """
    import os
    import sys

    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_lstm_ner_train_inpend_eval import train

    args = get_args_parser()

    # Print the command line followed by every setting, sorted by name.
    formatted = ['%20s = %s' % (name, value)
                 for name, value in sorted(vars(args).items())]
    param_str = '\n'.join(formatted)
    summary = 'usage: %s\n%20s %s\n%s\n%s\n' % (
        ' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str)
    print(summary)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)
def train_ner():
    """Parse the training arguments, print them and start NER training.

    The trainer is imported from the top-level ``bert_lstm_ner`` module, so
    that module has to be importable from the current path.
    """
    import os
    import sys

    from bert_base.train.train_helper import get_args_parser
    from bert_lstm_ner import train

    args = get_args_parser()

    # Print the command line followed by every setting, sorted by name.
    settings = sorted(vars(args).items())
    param_str = '\n'.join('%20s = %s' % (key, value) for key, value in settings)
    invocation = ' '.join(sys.argv)
    print('usage: %s\n%20s %s\n%s\n%s\n' % (invocation, 'ARG', 'VALUE', '_' * 50, param_str))

    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)


# if __name__ == '__main__':
#     # start_server()
#     train_ner()
def get_ner_list(sentences):
    """Return the NER results for each sentence in ``sentences``.

    The sentences are sent to a BERT server through ``BertClient`` in
    ``'NER'`` mode; every returned result is paired with the tokenized
    sentence and converted by ``NER_Result.result_to_json``.

    Args:
        sentences: iterable of sentence strings passed to ``bc.encode``.

    Returns:
        A list with one ``result_to_json`` value per sentence, in input
        order.
    """
    args = get_args_parser()

    bert_dir = r'NER_model/chinese_L-12_H-768_A-12'
    vocab_path = os.path.join(bert_dir, 'vocab.txt')
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                           do_lower_case=args.do_lower_case)

    bc = BertClient(show_server_config=False,
                    check_version=False,
                    check_length=False,
                    mode='NER')
    predictions = bc.encode(sentences)

    converter = NER_Result()
    return [
        converter.result_to_json(tokenizer.tokenize(sentence), prediction)
        for sentence, prediction in zip(sentences, predictions)
    ]
def train_ner():
    """Parse the training arguments, wipe the output directory and train NER.

    The parsed settings are printed, ``CUDA_VISIBLE_DEVICES`` is set from
    ``args.device_map``, and any existing ``args.output_dir`` is deleted
    recursively before ``train`` is called, so every run starts from an
    empty output directory.
    """
    import os
    # Imported locally like ``os``/``sys``: the function previously used
    # ``shutil`` without importing it and depended on the enclosing module
    # having done so.
    import shutil
    import sys

    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_lstm_ner import train

    args = get_args_parser()

    # Print the command line followed by every setting, sorted by name.
    settings = sorted(vars(args).items())
    param_str = '\n'.join('%20s = %s' % (key, value) for key, value in settings)
    print('usage: %s\n%20s %s\n%s\n%s\n' %
          (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str))
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    # Destructive: everything under the output directory is removed.
    if os.path.isdir(args.output_dir):
        shutil.rmtree(args.output_dir)

    train(args=args)
def train_ner():
    """Train the NER model with settings hard-coded for one local checkout.

    Overrides several parsed arguments (sequence length, epochs, learning
    rate, warm-up proportion, output directory under ``root_path``), prints
    the resulting settings, sets ``CUDA_VISIBLE_DEVICES`` from
    ``args.device_map`` and calls ``train``.
    """
    import os
    import sys

    from bert_base.train.train_helper import get_args_parser
    from bert_base.train.bert_lstm_ner import train

    # NOTE: bert_path is assigned but not referenced anywhere below.
    bert_path = r'D:\localE\code\daguang_extract\BERT-BiLSTM-CRF-NER-tjl\chinese_L-12_H-768_A-12\MSRA'
    root_path = r'D:\localE\code\daguang_extract\BERT-BiLSTM-CRF-NER-tjl'

    args = get_args_parser()
    overrides = (
        ('clean', True),
        ('max_seq_length', 128),
        ('do_train', True),
        ('output_dir', os.path.join(root_path, 'output')),
        ('num_train_epochs', 30),
        ('learning_rate', 1e-4),
        ('warmup_proportion', 0.1),
    )
    for option, value in overrides:
        setattr(args, option, value)

    # Print the command line followed by every setting, sorted by name.
    rows = ['%20s = %s' % item for item in sorted(vars(args).items())]
    param_str = '\n'.join(rows)
    print('usage: %s\n%20s %s\n%s\n%s\n' %
          (' '.join(sys.argv), 'ARG', 'VALUE', '_' * 50, param_str))
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
    train(args=args)
基于terminal_predict.py进行的修改,用于处理资讯新闻的人名和机构名等的抽取 @Author: geodgechen """ import tensorflow as tf import numpy as np import codecs import pickle import os from datetime import datetime from bert_base.train.models import create_model, InputFeatures from bert_base.bert import tokenization, modeling from bert_base.train.train_helper import get_args_parser args = get_args_parser() model_dir = '/data/label/ner_extract/output' bert_dir = '/data/label/ner_extract/chinese_L-12_H-768_A-12' is_training = False use_one_hot_embeddings = False batch_size = 1 gpu_config = tf.ConfigProto() gpu_config.gpu_options.allow_growth = True sess = tf.Session(config=gpu_config) model = None global graph input_ids_p, input_mask_p, label_ids_p, segment_ids_p = None, None, None, None
def zjb_eval(output_dir, data_dir, max_seq_length, vocab_file, batch_size,
             learning_rate, init_checkpoint, bert_config_file,
             num_train_epochs):
    """Predict on the test examples and score the output with conlleval.

    Steps, in order:
      * load ``label2id.pkl`` from ``output_dir`` and invert it;
      * convert the test examples from ``data_dir`` to
        ``predict.tf_record`` in ``output_dir``;
      * build an estimator rooted at ``output_dir`` and run ``predict``;
      * write "token gold-label predicted-label" rows to
        ``label_test.txt``, one blank-line-terminated group per example;
      * print the conlleval report and append it to ``predict_score.txt``;
      * call ``adam_filter(output_dir)`` when ``args.filter_adam_var`` is
        set.

    Args:
        output_dir: directory holding ``label2id.pkl`` and the model; all
            outputs are written here.
        data_dir: directory the processor reads train/test examples from.
        max_seq_length: sequence length used for feature conversion and
            for the input function.
        vocab_file: vocabulary file for the tokenizer.
        batch_size: used for logging and for the training-step count.
        learning_rate: forwarded to ``model_fn_builder``.
        init_checkpoint: forwarded to ``model_fn_builder``.
        bert_config_file: JSON file describing the BERT configuration.
        num_train_epochs: used to derive the number of training steps.

    Raises:
        AttributeError: if the derived number of training steps is below 1.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    from bert_base.train.train_helper import get_args_parser
    args = get_args_parser()

    # Remove an existing token_test.txt before features are converted.
    old_token_file = os.path.join(output_dir, "token_test.txt")
    if os.path.exists(old_token_file):
        os.remove(old_token_file)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use an output_dir whose contents are trusted.
    with codecs.open(os.path.join(output_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
    id2label = {label_id: label for label, label_id in label2id.items()}

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=args.do_lower_case)

    processors = {"ner": NerProcessor}
    processor = processors[args.ner](output_dir)
    label_list = processor.get_labels()

    predict_examples = processor.get_test_examples(data_dir)
    predict_file = os.path.join(output_dir, "predict.tf_record")
    filed_based_convert_examples_to_features(predict_examples,
                                             label_list,
                                             max_seq_length,
                                             tokenizer,
                                             predict_file,
                                             output_dir,
                                             mode="test")

    tf.logging.info("***** Running prediction*****")
    tf.logging.info(" Num examples = %d", len(predict_examples))
    tf.logging.info(" Batch size = %d", batch_size)

    predict_input_fn = file_based_input_fn_builder(input_file=predict_file,
                                                   seq_length=max_seq_length,
                                                   is_training=False,
                                                   drop_remainder=False)

    # GPU memory is not capped here; per_process_gpu_memory_fraction could
    # be set on session_config.gpu_options (e.g. 0.8 for 80% of the memory).
    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    run_config = tf.estimator.RunConfig(model_dir=output_dir,
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    # The step counts are only needed to build model_fn.
    train_examples = processor.get_train_examples(data_dir)
    num_train_steps = int(
        len(train_examples) * 1.0 / batch_size * num_train_epochs)
    if num_train_steps < 1:
        raise AttributeError('training data is so small...')
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=init_checkpoint,
                                learning_rate=learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                args=args)

    estimator = tf.estimator.Estimator(model_fn,
                                       params={'batch_size': args.batch_size},
                                       config=run_config)
    result = estimator.predict(input_fn=predict_input_fn)
    output_predict_file = os.path.join(output_dir, "label_test.txt")

    def result_to_pair(writer):
        """Write one "token gold predicted" row per labelled position."""
        for example, predicted_ids in zip(predict_examples, result):
            tokens = str(example.text).split(' ')
            gold_labels = str(example.label).split(' ')
            seq_len = len(gold_labels)

            # A token/label length mismatch stops the whole dump.
            if len(tokens) != seq_len:
                tf.logging.info(example.text)
                tf.logging.info(example.label)
                break

            rows = []
            position = 0
            for label_id in predicted_ids:
                if position >= seq_len:
                    break
                # Id 0 and the [CLS]/[SEP] markers do not consume a token.
                if label_id == 0:
                    continue
                predicted = id2label[label_id]
                if predicted in ('[CLS]', '[SEP]'):
                    continue
                try:
                    row = (tokens[position] + ' ' + gold_labels[position]
                           + ' ' + predicted + '\n')
                except Exception as e:
                    tf.logging.info(e)
                    tf.logging.info(example.text)
                    tf.logging.info(example.label)
                    # Discard the partial example; only a blank line is
                    # written for it.
                    rows = []
                    break
                rows.append(row)
                position += 1
            writer.write(''.join(rows) + '\n')

    with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
        result_to_pair(writer)

    from bert_base.train import conlleval
    eval_result = conlleval.return_report(output_predict_file)
    print(''.join(eval_result))

    # Append the report to the score file.
    score_path = os.path.join(output_dir, 'predict_score.txt')
    with codecs.open(score_path, 'a', encoding='utf-8') as fd:
        fd.write(''.join(eval_result))

    # filter model
    if args.filter_adam_var:
        adam_filter(output_dir)
def Bilstm_CRF_save_model(save_path):
    """Export the BERT-BiLSTM-CRF estimator as a SavedModel.

    Builds the estimator from the checkpoint and BERT config under
    ``./checkpoint1`` and the model directory ``output/result_dir``, then
    exports it to ``<save_path>/saved_model``. The exported serving
    signature takes a single ``input_ids`` tensor.

    Args:
        save_path: directory under which ``saved_model`` is created.
    """
    from bert_base.bert import modeling
    from bert_base.train.bert_lstm_ner import NerProcessor
    from bert_base.train.bert_lstm_ner import model_fn_builder
    from bert_base.train.train_helper import get_args_parser

    def serving_input_receiver_fn(max_seq_length=128):
        """Build the serving input: int64 ``input_ids`` of fixed width.

        Only ``input_ids`` is exposed; input_mask, segment_ids and
        label_ids are not part of the serving signature.
        """
        input_ids = tf.placeholder(dtype=tf.int64,
                                   shape=[None, max_seq_length],
                                   name='input_ids')
        receive_tensors = {'input_ids': input_ids}
        features = {'input_ids': input_ids}
        return tf.estimator.export.ServingInputReceiver(
            features, receive_tensors)

    args = get_args_parser()
    bert_config = modeling.BertConfig.from_json_file(
        './checkpoint1/bert_config.json')
    processor = NerProcessor('output/result_dir')
    label_list = processor.get_labels()

    session_config = tf.ConfigProto(log_device_placement=False,
                                    inter_op_parallelism_threads=0,
                                    intra_op_parallelism_threads=0,
                                    allow_soft_placement=True)
    run_config = tf.estimator.RunConfig(model_dir='output/result_dir',
                                        save_summary_steps=500,
                                        save_checkpoints_steps=500,
                                        session_config=session_config)

    print('len label list is {}'.format(len(label_list)))
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint='./checkpoint1/bert_model.ckpt',
        learning_rate=2e-5,
        num_train_steps=10,
        num_warmup_steps=10,
        args=args)

    # params is a dict: the keys are the parameter names used inside
    # model_fn and the values are the corresponding data.
    params = {
        'batch_size': 32,
    }
    estimator = tf.estimator.Estimator(model_fn,
                                       'output/result_dir',
                                       params=params,
                                       config=run_config)
    estimator.export_saved_model('{}/saved_model'.format(save_path),
                                 serving_input_receiver_fn)