def start_bundle(self):
    bert_config = modeling.BertConfig.from_json_file(self._bert_config_file)
    self._tokenizer = tokenization.FullTokenizer(
        vocab_file=self._vocab_file, do_lower_case=self._do_lower_case)
    is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.compat.v1.estimator.tpu.RunConfig(
        master=None,
        tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
            num_shards=1, per_host_input_for_training=is_per_host))
    model_fn = extract_features.model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=self._init_checkpoint,
        layer_indexes=self._layer_indexes,
        use_tpu=False,
        use_one_hot_embeddings=False)
    self._estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=1)
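# A hedged sketch (not part of the original snippet) of how the estimator
# built in start_bundle might be consumed in the DoFn's process() method,
# using the helpers from bert's extract_features module. The attribute
# `self._max_seq_length` and the InputExample construction are assumptions.
def process(self, element):
    examples = [extract_features.InputExample(
        unique_id=0, text_a=element, text_b=None)]
    features = extract_features.convert_examples_to_features(
        examples=examples,
        seq_length=self._max_seq_length,  # assumed attribute
        tokenizer=self._tokenizer)
    input_fn = extract_features.input_fn_builder(
        features=features, seq_length=self._max_seq_length)
    for result in self._estimator.predict(input_fn, yield_single_examples=True):
        yield result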
def _build_bert_model(self):
    # Load the pre-trained model config.
    bert_config_file = self.bert_model_dir + "bert_config.json"
    bert_config = BertConfig.from_json_file(bert_config_file)
    # TPU plumbing -- not used in this case, so it can be overlooked.
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=8, per_host_input_for_training=is_per_host))
    # Then build the BERT model. The bert_model.ckpt checkpoint is actually
    # three files, but is referenced by this single prefix.
    checkpoint_file = self.bert_model_dir + 'bert_model.ckpt'
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=checkpoint_file,
        layer_indexes=self.layer_indexes,
        # The extract_features script recommends setting this to True when
        # running on a TPU, which is apparently much faster.
        use_tpu=False,
        use_one_hot_embeddings=False)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=32)
    return estimator
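# A minimal usage sketch (assumed, not from the original source): drive the
# estimator returned above with read_examples / convert_examples_to_features /
# input_fn_builder from bert's extract_features script. `encoder`,
# `input_file`, and `seq_length` are placeholders.
estimator = encoder._build_bert_model()
examples = read_examples(input_file)
features = convert_examples_to_features(
    examples=examples, seq_length=seq_length, tokenizer=tokenizer)
input_fn = input_fn_builder(features=features, seq_length=seq_length)
for result in estimator.predict(input_fn):
    unique_id = int(result["unique_id"])  # extract_features also emits
                                          # one "layer_output_%d" per layer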
def __init__(self, id, args, worker_address, sink_address):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp,
        pooling_strategy=args.pooling_strategy,
        pooling_layer=args.pooling_layer)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
    self.estimator = Estimator(self.model_fn,
                               config=RunConfig(session_config=config))
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('WORKER-%d' % self.worker_id)
    self.worker_address = worker_address
    self.sink_address = sink_address
def prepare_bert(bert_path, bert_config_file, bert_vocab_file, init_checkpoint,
                 select_layers):
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=init_checkpoint,
        layer_indexes=select_layers,
        use_tpu=False,
        use_one_hot_embeddings=False)
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        model_dir=bert_path,
        use_tpu=False,
        predict_batch_size=32,
        config=tf.contrib.tpu.RunConfig())
    # Alternative configurations kept for reference:
    # config=tf.contrib.tpu.RunConfig(
    #     master=None,
    #     tpu_config=tf.contrib.tpu.TPUConfig(
    #         num_shards=8,
    #         per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = 0.3
    # estimator = Estimator(model_fn, config=RunConfig(session_config=config),
    #                       params={'batch_size': 32}, model_dir=MODEL_DIR)
    tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab_file,
                                           do_lower_case=False)
    return estimator, tokenizer
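# Possible usage (assumed; all paths below are placeholders): the returned
# estimator/tokenizer pair feeds the same extract_features pipeline as above.
estimator, tokenizer = prepare_bert(
    bert_path='/path/to/model_dir',
    bert_config_file='/path/to/bert_config.json',
    bert_vocab_file='/path/to/vocab.txt',
    init_checkpoint='/path/to/bert_model.ckpt',
    select_layers=[-1, -2, -3, -4])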
def __init__(self, id, args):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_len = args.max_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    self.estimator = Estimator(self.model_fn)
    self.result = []
def __init__(self, id, args):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    self.estimator = Estimator(self.model_fn)
    self.dest = None
    self._start_t = time.perf_counter()
    self.socket = None
    self.exit_flag = multiprocessing.Event()
def __init__(self, id, args):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp,
        pooling_strategy=args.pooling_strategy,
        pooling_layer=args.pooling_layer)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    self.estimator = Estimator(self.model_fn)
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('WORKER-%d' % self.worker_id)
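# A rough sketch of how a worker like the ones above typically drives its
# estimator (modelled on bert-as-service; the input_fn helper and the loop
# body here are assumptions, not part of the original snippets):
def run(self):
    # hypothetical helper that builds a generator-backed tf.data input_fn
    # over incoming requests, in the style of the input_fn_builder below
    input_fn = self.input_fn_builder()
    for result in self.estimator.predict(input_fn, yield_single_examples=False):
        if self.exit_flag.is_set():
            break
        self.logger.info('encoded one batch')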
def init(self):
    bert_config = modeling.BertConfig.from_json_file(CONFIG)
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=CKPT,
        layer_indexes=_layers,
        use_tpu=False,
        use_one_hot_embeddings=False)
    self._estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        model_dir=MODEL_DIR,
        use_tpu=False,
        predict_batch_size=32,
        config=tf.contrib.tpu.RunConfig(
            master=None,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=8,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)))
    self._tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB,
                                                 do_lower_case=False)
def __init__(self):
    # The pooling layer index of the original BERT model.
    self.pooling_layer = [-2]
    # The pooling strategy of the original BERT model.
    self.pooling_strategy = PoolingStrategy.REDUCE_MEAN
    # The maximum total input sequence length after WordPiece tokenization.
    # Sequences longer than this will be truncated, and sequences shorter
    # than this will be padded.
    self.max_seq_len = 128
    self.bert_model_dir = sys_conf["bert_dir"]
    self.config_fp = os.path.join(self.bert_model_dir, "bert_config.json")
    self.ckpt_fp = os.path.join(self.bert_model_dir, "bert_model.ckpt")
    self.vocab_fp = os.path.join(self.bert_model_dir, "vocab.txt")
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.ckpt_fp,
        pooling_strategy=self.pooling_strategy,
        pooling_layer=self.pooling_layer)
    self.estimator = Estimator(self.model_fn)
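# A sketch of how this encoder might serve a request (assumed; modelled on
# bert-as-service): wrap the sentences in a generator-backed input_fn built
# from convert_lst_to_features, as in the input_fn_builder of the next
# snippet, then collect the pooled vectors from predict().
def encode(self, sentences):
    # hypothetical helper producing a tf.data input_fn over `sentences`
    input_fn = self.input_fn_builder(sentences)
    return [r for r in self.estimator.predict(input_fn)]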
import os

import tensorflow as tf
from tensorflow.python.estimator.estimator import Estimator
from tensorflow.python.estimator.run_config import RunConfig

from bert import modeling, tokenization
from bert.extract_features import (model_fn_builder, convert_lst_to_features,
                                   PoolingStrategy)

# Path of the directory containing this file.
path = os.path.dirname(os.path.abspath(__file__))

model_dir = "/Users/yucong/PycharmProjects/helloAi/bert"
config_fp = os.path.join(model_dir, 'bert_config.json')
checkpoint_fp = os.path.join(model_dir, 'bert_model.ckpt')
vocab_fp = os.path.join(model_dir, 'vocab.txt')
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_fp)
max_seq_len = 10

model_fn = model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(config_fp),
    init_checkpoint=checkpoint_fp,
    pooling_strategy=PoolingStrategy.NONE,
    pooling_layer=[-2])

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.3
estimator = Estimator(model_fn, config=RunConfig(session_config=config),
                      model_dir=None)

def input_fn_builder(msg):
    def gen():
        for i in range(1):
            tmp_f = list(convert_lst_to_features(msg, max_seq_len, tokenizer))
            yield {
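# The snippet above cuts off inside the generator. A minimal sketch of one
# plausible completion, modelled on bert-as-service's input_fn_builder; the
# feature field names (unique_id, input_ids, input_mask, input_type_ids) are
# assumed from what convert_lst_to_features produces.
def input_fn_builder_sketch(msg):
    def gen():
        tmp_f = list(convert_lst_to_features(msg, max_seq_len, tokenizer))
        yield {
            'unique_ids': [f.unique_id for f in tmp_f],
            'input_ids': [f.input_ids for f in tmp_f],
            'input_mask': [f.input_mask for f in tmp_f],
            'input_type_ids': [f.input_type_ids for f in tmp_f],
        }

    def input_fn():
        # Wrap the generator in a tf.data.Dataset so the Estimator can
        # consume one batch of padded features per yield.
        return tf.data.Dataset.from_generator(
            gen,
            output_types={'unique_ids': tf.int32,
                          'input_ids': tf.int32,
                          'input_mask': tf.int32,
                          'input_type_ids': tf.int32},
            output_shapes={'unique_ids': (None,),
                           'input_ids': (None, max_seq_len),
                           'input_mask': (None, max_seq_len),
                           'input_type_ids': (None, max_seq_len)})

    return input_fn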