def main():
    config = {
        "model_config_file": "/search/odin/liruihong/tts/multi_attn_model/config_data/classify_config.json",
        "max_seq_length": 128,
        "batch_size": 32,
        "word2vec_file": "/search/odin/liruihong/tts/multi_attn_model/config_data/100000-small.txt",
        "stop_words_file": "/search/odin/liruihong/tts/multi_attn_model/config_data/cn_stopwords.txt",
        "init_checkpoint": "/search/odin/liruihong/tts/bert_output/wordvec_attn/annotate_part_unlimitlen/model.ckpt-4600",
        "model_output_dir": "/search/odin/liruihong/tts/bert_output/wordvec_attn/annotate_part_unlimitlen",
        "http_port": 9001,
    }
    logger = set_logger("root", verbose=True, handler=logging.StreamHandler())

    ready_to_classify_que = Queue()
    classify_res_que = Queue()
    http_server = HTTPServer(config, ready_to_classify_que, classify_res_que, 1, logger)

    logger.info("start server")
    http_server.start()
    logger.info("finish all start")
def __init__(self, id, args, worker_address, sink_address):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp,
        pooling_strategy=args.pooling_strategy,
        pooling_layer=args.pooling_layer)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
    self.estimator = Estimator(self.model_fn, config=RunConfig(session_config=config))
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('WORKER-%d' % self.worker_id)
    self.worker_address = worker_address
    self.sink_address = sink_address
def __init__(self, **kwargs):
    self.tf = import_tf(kwargs['gpu_no'], kwargs['verbose'])
    self.logger = set_logger('BertNer', kwargs['log_dir'], kwargs['verbose'])
    self.model_dir = kwargs['ner_model']

    from bert.tokenization import FullTokenizer
    self.tokenizer = FullTokenizer(os.path.join(self.model_dir, 'vocab.txt'))

    self.ner_sq_len = 128
    self.input_ids = self.tf.placeholder(self.tf.int32, (None, self.ner_sq_len), 'input_ids')
    self.input_mask = self.tf.placeholder(self.tf.int32, (None, self.ner_sq_len), 'input_mask')

    # init graph
    self._init_graph()

    # init ner assist data
    self._init_predict_var()

    self.per_proun = [
        '甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸',
        '子', '丑', '寅', '卯', '辰', '巳', '午', '未', '申', '酉', '戌', '亥'
    ]
def __init__(self, args):
    super().__init__()
    self.model_dir = args.model_dir
    self.max_seq_len = args.max_seq_len
    self.num_worker = args.num_worker
    self.max_batch_size = args.max_batch_size
    self.port = args.port
    self.args = args
    self.args_dict = {
        'model_dir': args.model_dir,
        'max_seq_len': args.max_seq_len,
        'num_worker': args.num_worker,
        'max_batch_size': args.max_batch_size,
        'port': args.port,
        'tensorflow_version': tf.__version__,
        'python_version': sys.version,
        'server_time': str(datetime.now())
    }
    self.processes = []
    self.frontend = None  # REQ->ROUTER
    self.backend = None   # PUSH->PULL
    self.context = None
    self.exit_flag = threading.Event()
    self.logger = set_logger('DISPATCHER')
    self.client_checksum = {}
    self.pending_client = {}
    self.pending_checksum = {}
def __init__(self, args, frontend):
    super().__init__()
    self.port = args.port
    self.context = None
    self.receiver = None
    self.frontend = frontend
    self.exit_flag = threading.Event()
    self.logger = set_logger('SINK')
def __init__(self, args, frontend, client_chk):
    super().__init__()
    self.port = args.port
    self.context = None
    self.receiver = None
    self.frontend = frontend
    self.exit_flag = threading.Event()
    self.logger = set_logger('SINK')
    self.address = None
    self.client_checksum = client_chk
def __init__(self, gpu_no, log_dir, bert_sim_dir, verbose=False):
    self.bert_sim_dir = bert_sim_dir
    self.logger = set_logger(colored('BS', 'cyan'), log_dir, verbose)
    self.tf = import_tf(gpu_no, verbose)

    # add tokenizer
    from bert import tokenization
    self.tokenizer = tokenization.FullTokenizer(os.path.join(bert_sim_dir, 'vocab.txt'))

    # add placeholder
    self.input_ids = self.tf.placeholder(self.tf.int32, (None, 45), 'input_ids')
    self.input_mask = self.tf.placeholder(self.tf.int32, (None, 45), 'input_mask')
    self.input_type_ids = self.tf.placeholder(self.tf.int32, (None, 45), 'input_type_ids')

    # init graph
    self._init_graph()
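# A minimal sketch (not part of the original source) of how a sentence could be
# turned into the fixed-length (45) input_ids / input_mask / input_type_ids that
# the placeholders above expect. The helper name `convert_to_ids` is hypothetical;
# only FullTokenizer.tokenize / convert_tokens_to_ids from bert.tokenization are assumed.
def convert_to_ids(tokenizer, text, max_seq_len=45):
    tokens = ['[CLS]'] + tokenizer.tokenize(text)[:max_seq_len - 2] + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # zero-pad up to the fixed sequence length
    pad = [0] * (max_seq_len - len(input_ids))
    return input_ids + pad, input_mask + pad, [0] * max_seq_len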
def __init__(self, args):
    super().__init__()
    self.logger = set_logger('VENTILATOR')
    self.model_dir = args.model_dir
    self.max_seq_len = args.max_seq_len
    self.num_worker = args.num_worker
    self.max_batch_size = args.max_batch_size
    self.port = args.port
    self.args = args
    self.args_dict = {
        'model_dir': args.model_dir,
        'max_seq_len': args.max_seq_len,
        'num_worker': args.num_worker,
        'max_batch_size': args.max_batch_size,
        'port': args.port,
        'port_out': args.port_out,
        'pooling_layer': args.pooling_layer,
        'pooling_strategy': args.pooling_strategy.value,
        'tensorflow_version': tf.__version__,
        'python_version': sys.version,
        'server_start_time': str(datetime.now())
    }
    self.processes = []
    self.context = zmq.Context()

    # frontend facing client
    self.frontend = self.context.socket(zmq.PULL)
    self.frontend.bind('tcp://*:%d' % self.port)

    # pair connection between frontend and sink
    self.sink = self.context.socket(zmq.PAIR)
    self.sink.bind('ipc://*')
    self.addr_front2sink = self.sink.getsockopt(zmq.LAST_ENDPOINT).decode('ascii')

    # backend facing workers
    self.backend = self.context.socket(zmq.PUSH)
    self.backend.bind('ipc://*')
    self.addr_backend = self.backend.getsockopt(zmq.LAST_ENDPOINT).decode('ascii')

    # start the sink thread
    proc_sink = BertSink(self.args, self.addr_front2sink)
    proc_sink.start()
    self.processes.append(proc_sink)
    self.addr_sink = self.sink.recv().decode('ascii')
    self.logger.info('frontend-sink ipc: %s' % self.addr_sink)
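# A minimal client-side sketch (an assumption, not part of the original source):
# since the frontend above is a zmq.PULL socket bound on tcp://*:port, a client can
# connect with a zmq.PUSH socket and send a serialized list of sentences. The pickle
# format is an assumption; result delivery via the sink's port_out is not shown here.
import pickle
import zmq

def push_job(sentences, host='localhost', port=5555):
    context = zmq.Context()
    sender = context.socket(zmq.PUSH)
    sender.connect('tcp://%s:%d' % (host, port))
    sender.send(pickle.dumps(sentences))  # serialization format is an assumption
    sender.close()
    context.term()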
def __init__(self, id, args):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    self.estimator = Estimator(self.model_fn)
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('WORKER-%d' % self.worker_id)
def __init__(self, args, front_sink_addr):
    super().__init__()
    self.port = args.port_out
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('SINK')
    self.front_sink_addr = front_sink_addr
import os
import pickle
import threading
import time
from multiprocessing import Process

import tensorflow as tf
import zmq
from tensorflow.python.estimator.estimator import Estimator

from bert import tokenization, modeling
from bert.extract_features import model_fn_builder, convert_lst_to_features
from helper import set_logger

logger = set_logger()


class BertServer(threading.Thread):
    def __init__(self, args):
        super().__init__()
        self.model_dir = args.model_dir
        self.max_len = args.max_len
        self.num_worker = args.num_worker
        self.port = args.port
        self.args = args

    def run(self):
        context = zmq.Context.instance()
        frontend = context.socket(zmq.ROUTER)
        frontend.bind('tcp://*:%d' % self.port)
"input_type_ids": tf.int32, }, output_shapes={ "unique_ids": [None], "input_ids": [None, None], "input_mask": [None, None], "input_type_ids": [None, None], })) return input_fn args = get_run_args() logger = set_logger(colored('VENTILATOR', 'magenta'), args.verbose) graph_path, bert_config = optimize_graph(args=args) if graph_path: logger.info('optimized graph is stored at: %s' % graph_path) logger.info('use device %s, load graph from %s' % ('cpu', graph_path)) tf = import_tf(device_id=-1, verbose=args.verbose, use_fp16=args.fp16) estimator = get_estimator(args=args, tf=tf, graph_path=graph_path) save_hook = tf.train.CheckpointSaverHook(checkpoint_dir=args.checkpoint_dir, save_secs=1) predicts = estimator.predict(input_fn=input_fn_builder(), hooks=[save_hook]) for predict in predicts:
def optimize_graph(args, logger=None):
    if not logger:
        logger = set_logger(colored('GRAPHOPT', 'cyan'), args.verbose)
    try:
        # we don't need GPU for optimizing the graph
        tf = import_tf(verbose=args.verbose)
        from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference

        config = tf.ConfigProto(device_count={'GPU': 0}, allow_soft_placement=True)

        config_fp = os.path.join(args.model_dir, args.config_name)
        init_checkpoint = os.path.join(args.tuned_model_dir or args.model_dir, args.ckpt_name)
        if args.fp16:
            logger.warning('fp16 is turned on! '
                           'Note that not all CPUs/GPUs support fast fp16 instructions; '
                           'in the worst case you will have degraded performance!')
        logger.info('model config: %s' % config_fp)
        logger.info('checkpoint%s: %s' % (
            ' (override by the fine-tuned model)' if args.tuned_model_dir else '', init_checkpoint))
        with tf.gfile.GFile(config_fp, 'r') as f:
            bert_config = modeling.BertConfig.from_dict(json.load(f))

        logger.info('build graph...')
        # input placeholders, not sure if they are friendly to XLA
        input_ids = tf.placeholder(tf.int32, (None, None), 'input_ids')
        input_mask = tf.placeholder(tf.int32, (None, None), 'input_mask')
        input_type_ids = tf.placeholder(tf.int32, (None, None), 'input_type_ids')

        jit_scope = tf.contrib.compiler.jit.experimental_jit_scope if args.xla else contextlib.suppress

        with jit_scope():
            input_tensors = [input_ids, input_mask, input_type_ids]

            model = modeling.BertModel(config=bert_config,
                                       is_training=False,
                                       input_ids=input_ids,
                                       input_mask=input_mask,
                                       token_type_ids=input_type_ids,
                                       use_one_hot_embeddings=False)

            tvars = tf.trainable_variables()

            (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint)

            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

            minus_mask = lambda x, m: x - tf.expand_dims(1.0 - m, axis=-1) * 1e30
            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_max = lambda x, m: tf.reduce_max(minus_mask(x, m), axis=1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)

            with tf.variable_scope("pooling"):
                if len(args.pooling_layer) == 1:
                    encoder_layer = model.all_encoder_layers[args.pooling_layer[0]]
                else:
                    all_layers = [model.all_encoder_layers[l] for l in args.pooling_layer]
                    encoder_layer = tf.concat(all_layers, -1)

                input_mask = tf.cast(input_mask, tf.float32)
                if args.pooling_strategy == PoolingStrategy.REDUCE_MEAN:
                    pooled = masked_reduce_mean(encoder_layer, input_mask)
                elif args.pooling_strategy == PoolingStrategy.REDUCE_MAX:
                    pooled = masked_reduce_max(encoder_layer, input_mask)
                elif args.pooling_strategy == PoolingStrategy.REDUCE_MEAN_MAX:
                    pooled = tf.concat([masked_reduce_mean(encoder_layer, input_mask),
                                        masked_reduce_max(encoder_layer, input_mask)], axis=1)
                elif args.pooling_strategy == PoolingStrategy.FIRST_TOKEN or \
                        args.pooling_strategy == PoolingStrategy.CLS_TOKEN:
                    pooled = tf.squeeze(encoder_layer[:, 0:1, :], axis=1)
                elif args.pooling_strategy == PoolingStrategy.LAST_TOKEN or \
                        args.pooling_strategy == PoolingStrategy.SEP_TOKEN:
                    seq_len = tf.cast(tf.reduce_sum(input_mask, axis=1), tf.int32)
                    rng = tf.range(0, tf.shape(seq_len)[0])
                    indexes = tf.stack([rng, seq_len - 1], 1)
                    pooled = tf.gather_nd(encoder_layer, indexes)
                elif args.pooling_strategy == PoolingStrategy.NONE:
                    pooled = mul_mask(encoder_layer, input_mask)
                else:
                    raise NotImplementedError()

            if args.fp16:
                pooled = tf.cast(pooled, tf.float16)

            pooled = tf.identity(pooled, 'final_encodes')
            output_tensors = [pooled]
            tmp_g = tf.get_default_graph().as_graph_def()

        with tf.Session(config=config) as sess:
            logger.info('load parameters from checkpoint...')
            sess.run(tf.global_variables_initializer())
            dtypes = [n.dtype for n in input_tensors]
            logger.info('optimize...')
            tmp_g = optimize_for_inference(
                tmp_g,
                [n.name[:-2] for n in input_tensors],
                [n.name[:-2] for n in output_tensors],
                [dtype.as_datatype_enum for dtype in dtypes],
                False)
            logger.info('freeze...')
            tmp_g = convert_variables_to_constants(sess, tmp_g,
                                                   [n.name[:-2] for n in output_tensors],
                                                   use_fp16=args.fp16)

        tmp_file = tempfile.NamedTemporaryFile('w', delete=False, dir=args.graph_tmp_dir).name
        logger.info('write graph to a tmp file: %s' % tmp_file)
        with tf.gfile.GFile(tmp_file, 'wb') as f:
            f.write(tmp_g.SerializeToString())
        return tmp_file, bert_config
    except Exception:
        logger.error('fail to optimize the graph!', exc_info=True)
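# A minimal sketch, not part of the original source, of how the frozen GraphDef
# written by optimize_graph() could be loaded and queried. The tensor names
# ('input_ids', 'input_mask', 'input_type_ids', 'final_encodes') come from the
# code above; the feed values below are illustrative only.
def load_frozen_graph(tf, graph_path):
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(graph_path, 'rb') as f:
        graph_def.ParseFromString(f.read())
    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def, name='')
    return graph

# usage sketch:
# graph = load_frozen_graph(tf, graph_path)
# with tf.Session(graph=graph) as sess:
#     encodes = sess.run('final_encodes:0', feed_dict={
#         'input_ids:0': [[101, 102]],
#         'input_mask:0': [[1, 1]],
#         'input_type_ids:0': [[0, 0]],
#     })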